diff --git a/docs/source/developers/index.md b/docs/source/developers/index.md
index c9062ad1..aac92328 100644
--- a/docs/source/developers/index.md
+++ b/docs/source/developers/index.md
@@ -150,6 +150,84 @@ my-provider = "my_provider:MyEmbeddingsProvider"
 
 [Embeddings]: https://api.python.langchain.com/en/stable/embeddings/langchain_core.embeddings.Embeddings.html
+
+### Custom completion providers
+
+Any model provider derived from `BaseProvider` can be used as a completion provider.
+However, some providers may benefit from customizing how completion requests are handled.
+
+There are two asynchronous methods which can be overridden in subclasses of `BaseProvider`:
+- `generate_inline_completions`: takes a request (`InlineCompletionRequest`) and returns an `InlineCompletionReply`
+- `stream_inline_completions`: takes a request and yields an initiating reply (`InlineCompletionReply`) with `isIncomplete` set to `True`, followed by subsequent chunks (`InlineCompletionStreamChunk`)
+
+When streaming, all replies and chunks for a given invocation of the `stream_inline_completions()` method should include a constant and unique string token identifying the stream. All chunks except the last chunk for a given item should have the `done` value set to `False`.
+
+The following example demonstrates a custom completion provider with both a method for sending multiple completions in one go and a method for streaming multiple completions concurrently.
+The implementation and explanation for the `merge_iterators` function used in this example can be found [here](https://stackoverflow.com/q/72445371/4877269).
+
+```python
+class MyCompletionProvider(BaseProvider, FakeListLLM):
+    id = "my_provider"
+    name = "My Provider"
+    model_id_key = "model"
+    models = ["model_a"]
+
+    def __init__(self, **kwargs):
+        kwargs["responses"] = ["This fake response will not be used for completion"]
+        super().__init__(**kwargs)
+
+    async def generate_inline_completions(self, request: InlineCompletionRequest):
+        return InlineCompletionReply(
+            list=InlineCompletionList(items=[
+                {"insertText": "An ant minding its own business"},
+                {"insertText": "A bug searching for a snack"}
+            ]),
+            reply_to=request.number,
+        )
+
+    async def stream_inline_completions(self, request: InlineCompletionRequest):
+        token_1 = f"t{request.number}s0"
+        token_2 = f"t{request.number}s1"
+
+        yield InlineCompletionReply(
+            list=InlineCompletionList(
+                items=[
+                    {"insertText": "An ", "isIncomplete": True, "token": token_1},
+                    {"insertText": "", "isIncomplete": True, "token": token_2}
+                ]
+            ),
+            reply_to=request.number,
+        )
+
+        # merge_iterators interleaves items from multiple async iterators (see the link above)
+        async for reply in merge_iterators([
+            self._stream("elephant dancing in the rain", request.number, token_1, start_with="An"),
+            self._stream("A flock of birds flying around a mountain", request.number, token_2)
+        ]):
+            yield reply
+
+    async def _stream(self, sentence, request_number, token, start_with=""):
+        suggestion = start_with
+
+        for fragment in sentence.split():
+            await asyncio.sleep(0.75)
+            suggestion += " " + fragment
+            yield InlineCompletionStreamChunk(
+                type="stream",
+                response={"insertText": suggestion, "token": token},
+                reply_to=request_number,
+                done=False
+            )
+
+        # finally, send a message confirming that we are done
+        yield InlineCompletionStreamChunk(
+            type="stream",
+            response={"insertText": suggestion, "token": token},
+            reply_to=request_number,
+            done=True,
+        )
+```
+
 ## Prompt templates
 
 Each provider can define **prompt templates** for each supported format. A prompt
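The documentation example above relies on a `merge_iterators` helper that is only referenced through the Stack Overflow link and is not included in this diff. A minimal sketch of such a helper is shown below, assuming an asyncio queue-based fan-in; it is illustrative only and not the implementation from the linked answer.

```python
import asyncio
from typing import AsyncIterator, Iterable, TypeVar

T = TypeVar("T")


async def merge_iterators(iterators: Iterable[AsyncIterator[T]]) -> AsyncIterator[T]:
    """Yield items from several async iterators as soon as each one becomes available."""
    queue: asyncio.Queue = asyncio.Queue()
    sentinel = object()  # marks the exhaustion of one source iterator

    async def drain(iterator: AsyncIterator[T]) -> None:
        # forward every item from a single source into the shared queue
        async for item in iterator:
            await queue.put(item)
        await queue.put(sentinel)

    tasks = [asyncio.create_task(drain(iterator)) for iterator in iterators]
    pending = len(tasks)
    try:
        while pending:
            item = await queue.get()
            if item is sentinel:
                pending -= 1
            else:
                yield item
    finally:
        for task in tasks:
            task.cancel()
```

Any equivalent fan-in over async iterators (for example `aiostream.stream.merge`) would serve the same purpose: chunks produced by the two `_stream()` calls are forwarded as soon as they are ready, so both suggestions stream concurrently.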
diff --git a/packages/jupyter-ai-magics/jupyter_ai_magics/completion_utils.py b/packages/jupyter-ai-magics/jupyter_ai_magics/completion_utils.py
new file mode 100644
index 00000000..204da5e7
--- /dev/null
+++ b/packages/jupyter-ai-magics/jupyter_ai_magics/completion_utils.py
@@ -0,0 +1,52 @@
+from typing import Dict
+
+from .models.completion import InlineCompletionRequest
+
+
+def token_from_request(request: InlineCompletionRequest, suggestion: int):
+    """Generate a deterministic token (for matching streamed messages)
+    using request number and suggestion number"""
+    return f"t{request.number}s{suggestion}"
+
+
+def template_inputs_from_request(request: InlineCompletionRequest) -> Dict:
+    suffix = request.suffix.strip()
+    filename = request.path.split("/")[-1] if request.path else "untitled"
+
+    return {
+        "prefix": request.prefix,
+        "suffix": suffix,
+        "language": request.language,
+        "filename": filename,
+        "stop": ["\n```"],
+    }
+
+
+def post_process_suggestion(suggestion: str, request: InlineCompletionRequest) -> str:
+    """Remove spurious fragments from the suggestion.
+
+    While most models (especially instruct and infill models) do not require
+    any pre-processing, some models such as gpt-4 which only have chat APIs
+    may require removing spurious fragments. This function uses heuristics
+    and request data to remove such fragments.
+    """
+    # gpt-4 tends to add "```python" or similar
+    language = request.language or "python"
+    markdown_identifiers = {"ipython": ["ipython", "python", "py"]}
+    bad_openings = [
+        f"```{identifier}"
+        for identifier in markdown_identifiers.get(language, [language])
+    ] + ["```"]
+    for opening in bad_openings:
+        if suggestion.startswith(opening):
+            suggestion = suggestion[len(opening) :].lstrip()
+            # check for the prefix inclusion (only if there was a bad opening)
+            if suggestion.startswith(request.prefix):
+                suggestion = suggestion[len(request.prefix) :]
+            break
+
+    # check if the suggestion ends with a closing markdown identifier and remove it
+    if suggestion.rstrip().endswith("```"):
+        suggestion = suggestion.rstrip()[:-3].rstrip()
+
+    return suggestion
diff --git a/packages/jupyter-ai-magics/jupyter_ai_magics/models/completion.py b/packages/jupyter-ai-magics/jupyter_ai_magics/models/completion.py
new file mode 100644
index 00000000..147f6cee
--- /dev/null
+++ b/packages/jupyter-ai-magics/jupyter_ai_magics/models/completion.py
@@ -0,0 +1,81 @@
+from typing import List, Literal, Optional
+
+from langchain.pydantic_v1 import BaseModel
+
+
+class InlineCompletionRequest(BaseModel):
+    """Message sent by client to request inline completions.
+
+    Prefix/suffix implementation is used to avoid the need for synchronising
+    the notebook state at every key press (subject to change in future)."""
+
+    # unique message ID generated by the client used to identify replies and
+    # to easily discard replies for older requests
+    number: int
+    # prefix should include full text of the current cell preceding the cursor
+    prefix: str
+    # suffix should include full text of the current cell following the cursor
+    suffix: str
+    # media type for the current language, e.g.
`text/x-python` + mime: str + # whether to stream the response (if supported by the model) + stream: bool + # path to the notebook of file for which the completions are generated + path: Optional[str] + # language inferred from the document mime type (if possible) + language: Optional[str] + # identifier of the cell for which the completions are generated if in a notebook + # previous cells and following cells can be used to learn the wider context + cell_id: Optional[str] + + +class InlineCompletionItem(BaseModel): + """The inline completion suggestion to be displayed on the frontend. + + See JupyterLab `InlineCompletionItem` documentation for the details. + """ + + insertText: str + filterText: Optional[str] + isIncomplete: Optional[bool] + token: Optional[str] + + +class CompletionError(BaseModel): + type: str + traceback: str + + +class InlineCompletionList(BaseModel): + """Reflection of JupyterLab's `IInlineCompletionList`.""" + + items: List[InlineCompletionItem] + + +class InlineCompletionReply(BaseModel): + """Message sent from model to client with the infill suggestions""" + + list: InlineCompletionList + # number of request for which we are replying + reply_to: int + error: Optional[CompletionError] + + +class InlineCompletionStreamChunk(BaseModel): + """Message sent from model to client with the infill suggestions""" + + type: Literal["stream"] = "stream" + response: InlineCompletionItem + reply_to: int + done: bool + error: Optional[CompletionError] + + +__all__ = [ + "InlineCompletionRequest", + "InlineCompletionItem", + "CompletionError", + "InlineCompletionList", + "InlineCompletionReply", + "InlineCompletionStreamChunk", +] diff --git a/packages/jupyter-ai-magics/jupyter_ai_magics/partner_providers/openai.py b/packages/jupyter-ai-magics/jupyter_ai_magics/partner_providers/openai.py index 382a480e..a1347073 100644 --- a/packages/jupyter-ai-magics/jupyter_ai_magics/partner_providers/openai.py +++ b/packages/jupyter-ai-magics/jupyter_ai_magics/partner_providers/openai.py @@ -75,23 +75,17 @@ class AzureChatOpenAIProvider(BaseProvider, AzureChatOpenAI): id = "azure-chat-openai" name = "Azure OpenAI" models = ["*"] - model_id_key = "deployment_name" + model_id_key = "azure_deployment" model_id_label = "Deployment name" pypi_package_deps = ["langchain_openai"] + # Confusingly, langchain uses both OPENAI_API_KEY and AZURE_OPENAI_API_KEY for azure + # https://github.com/langchain-ai/langchain/blob/f2579096993ae460516a0aae1d3e09f3eb5c1772/libs/partners/openai/langchain_openai/llms/azure.py#L85 auth_strategy = EnvAuthStrategy(name="AZURE_OPENAI_API_KEY") registry = True fields = [ - TextField( - key="openai_api_base", label="Base API URL (required)", format="text" - ), - TextField( - key="openai_api_version", label="API version (required)", format="text" - ), - TextField( - key="openai_organization", label="Organization (optional)", format="text" - ), - TextField(key="openai_proxy", label="Proxy (optional)", format="text"), + TextField(key="azure_endpoint", label="Base API URL (required)", format="text"), + TextField(key="api_version", label="API version (required)", format="text"), ] diff --git a/packages/jupyter-ai-magics/jupyter_ai_magics/providers.py b/packages/jupyter-ai-magics/jupyter_ai_magics/providers.py index 091b78fe..3d27a486 100644 --- a/packages/jupyter-ai-magics/jupyter_ai_magics/providers.py +++ b/packages/jupyter-ai-magics/jupyter_ai_magics/providers.py @@ -5,7 +5,17 @@ import io import json from concurrent.futures import ThreadPoolExecutor -from typing import Any, 
ClassVar, Coroutine, Dict, List, Literal, Optional, Union +from typing import ( + Any, + AsyncIterator, + ClassVar, + Coroutine, + Dict, + List, + Literal, + Optional, + Union, +) from jsonpath_ng import parse from langchain.chat_models.base import BaseChatModel @@ -20,6 +30,8 @@ ) from langchain.pydantic_v1 import BaseModel, Extra, root_validator from langchain.schema import LLMResult +from langchain.schema.output_parser import StrOutputParser +from langchain.schema.runnable import Runnable from langchain.utils import get_from_dict_or_env from langchain_community.chat_models import ( BedrockChat, @@ -46,6 +58,13 @@ except: from pydantic.main import ModelMetaclass +from . import completion_utils as completion +from .models.completion import ( + InlineCompletionList, + InlineCompletionReply, + InlineCompletionRequest, + InlineCompletionStreamChunk, +) from .models.persona import Persona CHAT_SYSTEM_PROMPT = """ @@ -405,6 +424,71 @@ def is_chat_provider(self): def allows_concurrency(self): return True + async def generate_inline_completions( + self, request: InlineCompletionRequest + ) -> InlineCompletionReply: + chain = self._create_completion_chain() + model_arguments = completion.template_inputs_from_request(request) + suggestion = await chain.ainvoke(input=model_arguments) + suggestion = completion.post_process_suggestion(suggestion, request) + return InlineCompletionReply( + list=InlineCompletionList(items=[{"insertText": suggestion}]), + reply_to=request.number, + ) + + async def stream_inline_completions( + self, request: InlineCompletionRequest + ) -> AsyncIterator[InlineCompletionStreamChunk]: + chain = self._create_completion_chain() + token = completion.token_from_request(request, 0) + model_arguments = completion.template_inputs_from_request(request) + suggestion = "" + + # send an incomplete `InlineCompletionReply`, indicating to the + # client that LLM output is about to streamed across this connection. 
+ yield InlineCompletionReply( + list=InlineCompletionList( + items=[ + { + # insert text starts empty as we do not pre-generate any part + "insertText": "", + "isIncomplete": True, + "token": token, + } + ] + ), + reply_to=request.number, + ) + + async for fragment in chain.astream(input=model_arguments): + suggestion += fragment + if suggestion.startswith("```"): + if "\n" not in suggestion: + # we are not ready to apply post-processing + continue + else: + suggestion = completion.post_process_suggestion(suggestion, request) + elif suggestion.rstrip().endswith("```"): + suggestion = completion.post_process_suggestion(suggestion, request) + yield InlineCompletionStreamChunk( + type="stream", + response={"insertText": suggestion, "token": token}, + reply_to=request.number, + done=False, + ) + + # finally, send a message confirming that we are done + yield InlineCompletionStreamChunk( + type="stream", + response={"insertText": suggestion, "token": token}, + reply_to=request.number, + done=True, + ) + + def _create_completion_chain(self) -> Runnable: + prompt_template = self.get_completion_prompt_template() + return prompt_template | self | StrOutputParser() + class AI21Provider(BaseProvider, AI21): id = "ai21" diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/help.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/help.py index e46038da..383076c5 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/help.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/help.py @@ -62,4 +62,14 @@ def __init__(self, *args, chat_handlers: Dict[str, BaseChatHandler], **kwargs): self._chat_handlers = chat_handlers async def process_message(self, message: HumanChatMessage): - self.reply(_format_help_message(self._chat_handlers), message) + persona = self.config_manager.persona + lm_provider = self.config_manager.lm_provider + unsupported_slash_commands = ( + lm_provider.unsupported_slash_commands if lm_provider else set() + ) + self.reply( + _format_help_message( + self._chat_handlers, persona, unsupported_slash_commands + ), + message, + ) diff --git a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py index e1a22c9c..38390a44 100644 --- a/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py +++ b/packages/jupyter-ai/jupyter_ai/chat_handlers/learn.py @@ -67,17 +67,23 @@ def __init__(self, *args, **kwargs): def _load(self): """Loads the vector store.""" - embeddings = self.get_embedding_model() - if not embeddings: + if self.index is not None: return - if self.index is None: - try: - self.index = FAISS.load_local( - INDEX_SAVE_DIR, embeddings, index_name=self.index_name - ) - self.load_metadata() - except Exception as e: - self.log.error("Could not load vector index from disk.") + + try: + embeddings = self.get_embedding_model() + if not embeddings: + return + + self.index = FAISS.load_local( + INDEX_SAVE_DIR, embeddings, index_name=self.index_name + ) + self.load_metadata() + except Exception as e: + self.log.error( + "Could not load vector index from disk. Full exception details printed below." 
+ ) + self.log.error(e) async def process_message(self, message: HumanChatMessage): # If no embedding provider has been selected @@ -118,13 +124,16 @@ async def process_message(self, message: HumanChatMessage): if args.verbose: self.reply(f"Loading and splitting files for {load_path}", message) - await self.learn_dir( - load_path, args.chunk_size, args.chunk_overlap, args.all_files - ) - self.save() - - response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. - You can ask questions about these docs by prefixing your message with **/ask**.""" + try: + await self.learn_dir( + load_path, args.chunk_size, args.chunk_overlap, args.all_files + ) + except Exception as e: + response = f"""Learn documents in **{load_path}** failed. {str(e)}.""" + else: + self.save() + response = f"""🎉 I have learned documents at **{load_path}** and I am ready to answer questions about them. + You can ask questions about these docs by prefixing your message with **/ask**.""" self.reply(response, message) def _build_list_response(self): @@ -155,7 +164,6 @@ async def learn_dir( delayed = split(path, all_files, splitter=splitter) doc_chunks = await dask_client.compute(delayed) - em_provider_cls, em_provider_args = self.get_embedding_provider() delayed = get_embeddings(doc_chunks, em_provider_cls, em_provider_args) embedding_records = await dask_client.compute(delayed) diff --git a/packages/jupyter-ai/jupyter_ai/completions/handlers/base.py b/packages/jupyter-ai/jupyter_ai/completions/handlers/base.py index c52c308d..9eb4f845 100644 --- a/packages/jupyter-ai/jupyter_ai/completions/handlers/base.py +++ b/packages/jupyter-ai/jupyter_ai/completions/handlers/base.py @@ -2,7 +2,7 @@ import time import traceback from asyncio import AbstractEventLoop -from typing import Any, AsyncIterator, Dict, Union +from typing import Union import tornado from jupyter_ai.completions.handlers.llm_mixin import LLMHandlerMixin @@ -14,7 +14,7 @@ InlineCompletionStreamChunk, ) from jupyter_server.base.handlers import JupyterHandler -from langchain.pydantic_v1 import BaseModel, ValidationError +from langchain.pydantic_v1 import ValidationError class BaseInlineCompletionHandler( @@ -27,12 +27,10 @@ class BaseInlineCompletionHandler( ## # Interface for subclasses ## - async def handle_request( - self, message: InlineCompletionRequest - ) -> InlineCompletionReply: + async def handle_request(self, message: InlineCompletionRequest) -> None: """ Handles an inline completion request, without streaming. Subclasses - must define this method and write a reply via `self.write_message()`. + must define this method and write a reply via `self.reply()`. The method definition does not need to be wrapped in a try/except block. """ @@ -40,14 +38,11 @@ async def handle_request( "The required method `self.handle_request()` is not defined by this subclass." ) - async def handle_stream_request( - self, message: InlineCompletionRequest - ) -> AsyncIterator[InlineCompletionStreamChunk]: + async def handle_stream_request(self, message: InlineCompletionRequest) -> None: """ Handles an inline completion request, **with streaming**. Implementations may optionally define this method. Implementations that - do so should stream replies via successive calls to - `self.write_message()`. + do so should stream replies via successive calls to `self.reply()`. The method definition does not need to be wrapped in a try/except block. 
""" @@ -64,14 +59,9 @@ async def handle_stream_request( def loop(self) -> AbstractEventLoop: return self.settings["jai_event_loop"] - def write_message(self, message: Union[bytes, str, Dict[str, Any], BaseModel]): - """ - Write a bytes, string, dict, or Pydantic model object to the WebSocket - connection. The base definition of this method is provided by Tornado. - """ - if isinstance(message, BaseModel): - message = message.dict() - + def reply(self, reply: Union[InlineCompletionReply, InlineCompletionStreamChunk]): + """Write a reply object to the WebSocket connection.""" + message = reply.dict() super().write_message(message) def initialize(self): @@ -144,7 +134,7 @@ async def handle_exc(self, e: Exception, request: InlineCompletionRequest): title=e.args[0] if e.args else "Exception", traceback=traceback.format_exc(), ) - self.write_message( + self.reply( InlineCompletionReply( list=InlineCompletionList(items=[]), error=error, diff --git a/packages/jupyter-ai/jupyter_ai/completions/handlers/default.py b/packages/jupyter-ai/jupyter_ai/completions/handlers/default.py index eb03df15..38676b99 100644 --- a/packages/jupyter-ai/jupyter_ai/completions/handlers/default.py +++ b/packages/jupyter-ai/jupyter_ai/completions/handlers/default.py @@ -1,154 +1,24 @@ -from typing import Dict, Type - -from jupyter_ai_magics.providers import BaseProvider -from langchain.prompts import ( - ChatPromptTemplate, - HumanMessagePromptTemplate, - PromptTemplate, - SystemMessagePromptTemplate, -) -from langchain.schema.output_parser import StrOutputParser -from langchain.schema.runnable import Runnable - -from ..models import ( - InlineCompletionList, - InlineCompletionReply, - InlineCompletionRequest, - InlineCompletionStreamChunk, -) +from ..models import InlineCompletionRequest from .base import BaseInlineCompletionHandler class DefaultInlineCompletionHandler(BaseInlineCompletionHandler): - llm_chain: Runnable - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def create_llm_chain( - self, provider: Type[BaseProvider], provider_params: Dict[str, str] - ): - unified_parameters = { - **provider_params, - **(self.get_model_parameters(provider, provider_params)), - } - llm = provider(**unified_parameters) - - prompt_template = llm.get_completion_prompt_template() - - self.llm = llm - self.llm_chain = prompt_template | llm | StrOutputParser() - - async def handle_request(self, request: InlineCompletionRequest) -> None: + async def handle_request(self, request: InlineCompletionRequest): """Handles an inline completion request without streaming.""" - self.get_llm_chain() - model_arguments = self._template_inputs_from_request(request) - suggestion = await self.llm_chain.ainvoke(input=model_arguments) - suggestion = self._post_process_suggestion(suggestion, request) - self.write_message( - InlineCompletionReply( - list=InlineCompletionList(items=[{"insertText": suggestion}]), - reply_to=request.number, - ) - ) - - def _write_incomplete_reply(self, request: InlineCompletionRequest): - """Writes an incomplete `InlineCompletionReply`, indicating to the - client that LLM output is about to streamed across this connection. 
- Should be called first in `self.handle_stream_request()`.""" + llm = self.get_llm() + if not llm: + raise ValueError("Please select a model for inline completion.") - token = self._token_from_request(request, 0) - reply = InlineCompletionReply( - list=InlineCompletionList( - items=[ - { - # insert text starts empty as we do not pre-generate any part - "insertText": "", - "isIncomplete": True, - "token": token, - } - ] - ), - reply_to=request.number, - ) - self.write_message(reply) + reply = await llm.generate_inline_completions(request) + self.reply(reply) async def handle_stream_request(self, request: InlineCompletionRequest): - # first, send empty initial reply. - self._write_incomplete_reply(request) - - # then, generate and stream LLM output over this connection. - self.get_llm_chain() - token = self._token_from_request(request, 0) - model_arguments = self._template_inputs_from_request(request) - suggestion = "" - - async for fragment in self.llm_chain.astream(input=model_arguments): - suggestion += fragment - if suggestion.startswith("```"): - if "\n" not in suggestion: - # we are not ready to apply post-processing - continue - else: - suggestion = self._post_process_suggestion(suggestion, request) - self.write_message( - InlineCompletionStreamChunk( - type="stream", - response={"insertText": suggestion, "token": token}, - reply_to=request.number, - done=False, - ) - ) - - # finally, send a message confirming that we are done - self.write_message( - InlineCompletionStreamChunk( - type="stream", - response={"insertText": suggestion, "token": token}, - reply_to=request.number, - done=True, - ) - ) - - def _token_from_request(self, request: InlineCompletionRequest, suggestion: int): - """Generate a deterministic token (for matching streamed messages) - using request number and suggestion number""" - return f"t{request.number}s{suggestion}" - - def _template_inputs_from_request(self, request: InlineCompletionRequest) -> Dict: - suffix = request.suffix.strip() - filename = request.path.split("/")[-1] if request.path else "untitled" - - return { - "prefix": request.prefix, - "suffix": suffix, - "language": request.language, - "filename": filename, - "stop": ["\n```"], - } - - def _post_process_suggestion( - self, suggestion: str, request: InlineCompletionRequest - ) -> str: - """Remove spurious fragments from the suggestion. + llm = self.get_llm() + if not llm: + raise ValueError("Please select a model for inline completion.") - While most models (especially instruct and infill models do not require - any pre-processing, some models such as gpt-4 which only have chat APIs - may require removing spurious fragments. This function uses heuristics - and request data to remove such fragments. 
- """ - # gpt-4 tends to add "```python" or similar - language = request.language or "python" - markdown_identifiers = {"ipython": ["ipython", "python", "py"]} - bad_openings = [ - f"```{identifier}" - for identifier in markdown_identifiers.get(language, [language]) - ] + ["```"] - for opening in bad_openings: - if suggestion.startswith(opening): - suggestion = suggestion[len(opening) :].lstrip() - # check for the prefix inclusion (only if there was a bad opening) - if suggestion.startswith(request.prefix): - suggestion = suggestion[len(request.prefix) :] - break - return suggestion + async for reply in llm.stream_inline_completions(request): + self.reply(reply) diff --git a/packages/jupyter-ai/jupyter_ai/completions/handlers/llm_mixin.py b/packages/jupyter-ai/jupyter_ai/completions/handlers/llm_mixin.py index fa16920d..e31abae8 100644 --- a/packages/jupyter-ai/jupyter_ai/completions/handlers/llm_mixin.py +++ b/packages/jupyter-ai/jupyter_ai/completions/handlers/llm_mixin.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Type +from logging import Logger +from typing import Any, Dict, Optional, Type from jupyter_ai.config_manager import ConfigManager from jupyter_ai_magics.providers import BaseProvider @@ -7,26 +8,24 @@ class LLMHandlerMixin: """Base class containing shared methods and attributes used by LLM handler classes.""" - # This could be used to derive `BaseChatHandler` too (there is a lot of duplication!), - # but it was decided against it to avoid introducing conflicts for backports against 1.x - handler_kind: str + settings: dict + log: Logger @property - def config_manager(self) -> ConfigManager: + def jai_config_manager(self) -> ConfigManager: return self.settings["jai_config_manager"] @property def model_parameters(self) -> Dict[str, Dict[str, Any]]: return self.settings["model_parameters"] - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.llm = None - self.llm_params = None - self.llm_chain = None + self._llm: Optional[BaseProvider] = None + self._llm_params = None - def get_llm_chain(self): + def get_llm(self) -> Optional[BaseProvider]: lm_provider = self.config_manager.completions_lm_provider lm_provider_params = self.config_manager.completions_lm_provider_params @@ -34,7 +33,7 @@ def get_llm_chain(self): return None curr_lm_id = ( - f'{self.llm.id}:{lm_provider_params["model_id"]}' if self.llm else None + f'{self._llm.id}:{lm_provider_params["model_id"]}' if self._llm else None ) next_lm_id = ( f'{lm_provider.id}:{lm_provider_params["model_id"]}' @@ -42,19 +41,23 @@ def get_llm_chain(self): else None ) + should_recreate_llm = False if curr_lm_id != next_lm_id: self.log.info( f"Switching {self.handler_kind} language model from {curr_lm_id} to {next_lm_id}." ) - self.create_llm_chain(lm_provider, lm_provider_params) - elif self.llm_params != lm_provider_params: + should_recreate_llm = True + elif self._llm_params != lm_provider_params: self.log.info( f"{self.handler_kind} model params changed, updating the llm chain." 
) - self.create_llm_chain(lm_provider, lm_provider_params) + should_recreate_llm = True + + if should_recreate_llm: + self._llm = self.create_llm(lm_provider, lm_provider_params) + self._llm_params = lm_provider_params - self.llm_params = lm_provider_params - return self.llm_chain + return self._llm def get_model_parameters( self, provider: Type[BaseProvider], provider_params: Dict[str, str] @@ -63,7 +66,13 @@ def get_model_parameters( f"{provider.id}:{provider_params['model_id']}", {} ) - def create_llm_chain( + def create_llm( self, provider: Type[BaseProvider], provider_params: Dict[str, str] - ): - raise NotImplementedError("Should be implemented by subclasses") + ) -> BaseProvider: + unified_parameters = { + **provider_params, + **(self.get_model_parameters(provider, provider_params)), + } + llm = provider(**unified_parameters) + + return llm diff --git a/packages/jupyter-ai/jupyter_ai/completions/models.py b/packages/jupyter-ai/jupyter_ai/completions/models.py index 50736540..e9679379 100644 --- a/packages/jupyter-ai/jupyter_ai/completions/models.py +++ b/packages/jupyter-ai/jupyter_ai/completions/models.py @@ -1,71 +1,17 @@ -from typing import List, Literal, Optional - -from langchain.pydantic_v1 import BaseModel - - -class InlineCompletionRequest(BaseModel): - """Message send by client to request inline completions. - - Prefix/suffix implementation is used to avoid the need for synchronising - the notebook state at every key press (subject to change in future).""" - - # unique message ID generated by the client used to identify replies and - # to easily discard replies for older requests - number: int - # prefix should include full text of the current cell preceding the cursor - prefix: str - # suffix should include full text of the current cell preceding the cursor - suffix: str - # media type for the current language, e.g. `text/x-python` - mime: str - # whether to stream the response (if supported by the model) - stream: bool - # path to the notebook of file for which the completions are generated - path: Optional[str] - # language inferred from the document mime type (if possible) - language: Optional[str] - # identifier of the cell for which the completions are generated if in a notebook - # previous cells and following cells can be used to learn the wider context - cell_id: Optional[str] - - -class InlineCompletionItem(BaseModel): - """The inline completion suggestion to be displayed on the frontend. - - See JuptyerLab `InlineCompletionItem` documentation for the details. 
- """ - - insertText: str - filterText: Optional[str] - isIncomplete: Optional[bool] - token: Optional[str] - - -class CompletionError(BaseModel): - type: str - traceback: str - - -class InlineCompletionList(BaseModel): - """Reflection of JupyterLab's `IInlineCompletionList`.""" - - items: List[InlineCompletionItem] - - -class InlineCompletionReply(BaseModel): - """Message sent from model to client with the infill suggestions""" - - list: InlineCompletionList - # number of request for which we are replying - reply_to: int - error: Optional[CompletionError] - - -class InlineCompletionStreamChunk(BaseModel): - """Message sent from model to client with the infill suggestions""" - - type: Literal["stream"] = "stream" - response: InlineCompletionItem - reply_to: int - done: bool - error: Optional[CompletionError] +from jupyter_ai_magics.models.completion import ( + CompletionError, + InlineCompletionItem, + InlineCompletionList, + InlineCompletionReply, + InlineCompletionRequest, + InlineCompletionStreamChunk, +) + +__all__ = [ + "InlineCompletionRequest", + "InlineCompletionItem", + "CompletionError", + "InlineCompletionList", + "InlineCompletionReply", + "InlineCompletionStreamChunk", +] diff --git a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py index 6607d97d..561f00a1 100644 --- a/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py +++ b/packages/jupyter-ai/jupyter_ai/document_loaders/directory.py @@ -8,13 +8,12 @@ from langchain.document_loaders import PyPDFLoader from langchain.schema import Document from langchain.text_splitter import TextSplitter -from pypdf import PdfReader # Uses pypdf which is used by PyPDFLoader from langchain def pdf_to_text(path): - reader = PdfReader(path) - text = "\n \n".join([page.extract_text() for page in reader.pages]) + pages = PyPDFLoader(path) + text = "\n \n".join([page.page_content for page in pages.load_and_split()]) return text diff --git a/packages/jupyter-ai/jupyter_ai/tests/completions/test_handlers.py b/packages/jupyter-ai/jupyter_ai/tests/completions/test_handlers.py index 0028356a..c5b5d1ee 100644 --- a/packages/jupyter-ai/jupyter_ai/tests/completions/test_handlers.py +++ b/packages/jupyter-ai/jupyter_ai/tests/completions/test_handlers.py @@ -1,8 +1,14 @@ import json from types import SimpleNamespace +from typing import Union +import pytest from jupyter_ai.completions.handlers.default import DefaultInlineCompletionHandler -from jupyter_ai.completions.models import InlineCompletionRequest +from jupyter_ai.completions.models import ( + InlineCompletionReply, + InlineCompletionRequest, + InlineCompletionStreamChunk, +) from jupyter_ai_magics import BaseProvider from langchain_community.llms import FakeListLLM from pytest import fixture @@ -17,28 +23,31 @@ class MockProvider(BaseProvider, FakeListLLM): models = ["model"] def __init__(self, **kwargs): - kwargs["responses"] = ["Test response"] + if "responses" not in kwargs: + kwargs["responses"] = ["Test response"] super().__init__(**kwargs) class MockCompletionHandler(DefaultInlineCompletionHandler): - def __init__(self): + def __init__(self, lm_provider=None, lm_provider_params=None): self.request = HTTPServerRequest() self.application = Application() self.messages = [] self.tasks = [] self.settings["jai_config_manager"] = SimpleNamespace( - completions_lm_provider=MockProvider, - completions_lm_provider_params={"model_id": "model"}, + completions_lm_provider=lm_provider or MockProvider, + 
completions_lm_provider_params=lm_provider_params or {"model_id": "model"}, ) self.settings["jai_event_loop"] = SimpleNamespace( create_task=lambda x: self.tasks.append(x) ) self.settings["model_parameters"] = {} - self.llm_params = {} - self.create_llm_chain(MockProvider, {"model_id": "model"}) + self._llm_params = {} + self._llm = None - def write_message(self, message: str) -> None: # type: ignore + def reply( + self, message: Union[InlineCompletionReply, InlineCompletionStreamChunk] + ) -> None: self.messages.append(message) async def handle_exc(self, e: Exception, _request: InlineCompletionRequest): @@ -89,8 +98,44 @@ async def test_handle_request(inline_handler): assert suggestions[0].insertText == "Test response" -async def test_handle_stream_request(inline_handler): - inline_handler.llm_chain = FakeListLLM(responses=["test"]) +@pytest.mark.parametrize( + "response,expected_suggestion", + [ + ("```python\nTest python code\n```", "Test python code"), + ("```\ntest\n```\n \n", "test"), + ("```hello```world```", "hello```world"), + ], +) +async def test_handle_request_with_spurious_fragments(response, expected_suggestion): + inline_handler = MockCompletionHandler( + lm_provider=MockProvider, + lm_provider_params={ + "model_id": "model", + "responses": [response], + }, + ) + dummy_request = InlineCompletionRequest( + number=1, prefix="", suffix="", mime="", stream=False + ) + + await inline_handler.handle_request(dummy_request) + # should write a single reply + assert len(inline_handler.messages) == 1 + # reply should contain a single suggestion + suggestions = inline_handler.messages[0].list.items + assert len(suggestions) == 1 + # the suggestion should include insert text from LLM without spurious fragments + assert suggestions[0].insertText == expected_suggestion + + +async def test_handle_stream_request(): + inline_handler = MockCompletionHandler( + lm_provider=MockProvider, + lm_provider_params={ + "model_id": "model", + "responses": ["test"], + }, + ) dummy_request = InlineCompletionRequest( number=1, prefix="", suffix="", mime="", stream=True ) @@ -102,16 +147,16 @@ async def test_handle_stream_request(inline_handler): # first reply should be empty to start the stream first = inline_handler.messages[0].list.items[0] assert first.insertText == "" - assert first.isIncomplete == True + assert first.isIncomplete is True # second reply should be a chunk containing the token second = inline_handler.messages[1] assert second.type == "stream" - assert second.response.insertText == "Test response" - assert second.done == False + assert second.response.insertText == "test" + assert second.done is False # third reply should be a closing chunk third = inline_handler.messages[2] assert third.type == "stream" - assert third.response.insertText == "Test response" - assert third.done == True + assert third.response.insertText == "test" + assert third.done is True diff --git a/packages/jupyter-ai/pyproject.toml b/packages/jupyter-ai/pyproject.toml index 763963c9..f5eb5e98 100644 --- a/packages/jupyter-ai/pyproject.toml +++ b/packages/jupyter-ai/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "jupyterlab~=4.0", "aiosqlite>=0.18", "importlib_metadata>=5.2.0", - "jupyter_ai_magics", + "jupyter_ai_magics>=2.13.0", "dask[distributed]", "faiss-cpu", # Not distributed by official repo "typing_extensions>=4.5.0", @@ -54,7 +54,7 @@ test = [ dev = ["jupyter_ai_magics[dev]"] -all = ["jupyter_ai_magics[all]"] +all = ["jupyter_ai_magics[all]", "pypdf"] [tool.hatch.version] source = "nodejs"
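As a quick, self-contained illustration of the heuristics implemented by the new `completion_utils.post_process_suggestion` helper (the same cases exercised by the parametrized test above), the following sketch assumes a `jupyter_ai_magics` build that includes the modules added in this diff:

```python
from jupyter_ai_magics.completion_utils import (
    post_process_suggestion,
    token_from_request,
)
from jupyter_ai_magics.models.completion import InlineCompletionRequest

# minimal request; optional fields (path, language, cell_id) default to None
request = InlineCompletionRequest(
    number=1,
    prefix="def fib(n):\n",
    suffix="",
    mime="text/x-python",
    stream=False,
    language="python",
)

# deterministic stream token for the first suggestion of request #1
assert token_from_request(request, 0) == "t1s0"

# markdown fences added by chat-only models such as gpt-4 are stripped
assert post_process_suggestion("```python\nTest python code\n```", request) == "Test python code"
assert post_process_suggestion("```\ntest\n```\n \n", request) == "test"

# fences inside the body of the suggestion are preserved
assert post_process_suggestion("```hello```world```", request) == "hello```world"
```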
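Finally, a hedged sketch of exercising the new provider-level completion API from `providers.py` directly (outside of a Jupyter server), using a `FakeListLLM`-backed provider shaped like the `MockProvider` in the tests above. The `EchoProvider` class and its canned response are illustrative assumptions, not part of this diff:

```python
import asyncio

from jupyter_ai_magics import BaseProvider
from jupyter_ai_magics.models.completion import InlineCompletionRequest
from langchain_community.llms import FakeListLLM


class EchoProvider(BaseProvider, FakeListLLM):
    """Toy provider that always returns one canned completion (illustration only)."""

    id = "echo"
    name = "Echo"
    models = ["echo-model"]
    model_id_key = "model"

    def __init__(self, **kwargs):
        kwargs.setdefault("responses", ["print('hello world')"])
        super().__init__(**kwargs)


async def main() -> None:
    provider = EchoProvider(model_id="echo-model")
    request = InlineCompletionRequest(
        number=1, prefix="# greet the user\n", suffix="", mime="text/x-python", stream=False
    )

    # non-streaming path: the default BaseProvider implementation runs the
    # completion chain (prompt template | llm | StrOutputParser) and post-processes
    # the raw output before wrapping it in an InlineCompletionReply
    reply = await provider.generate_inline_completions(request)
    print(reply.list.items[0].insertText)

    # streaming path: an initial incomplete reply, then one or more chunks,
    # then a closing chunk with done=True, all sharing the same stream token
    async for message in provider.stream_inline_completions(request):
        print(type(message).__name__, getattr(message, "done", None))


asyncio.run(main())
```

This mirrors what the refactored `DefaultInlineCompletionHandler` now does: it fetches the provider via `get_llm()` and simply forwards the replies produced by `generate_inline_completions()` / `stream_inline_completions()` through `self.reply()`.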