diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f16e8ba
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,4 @@
+.git
+models
+.venv
+db
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
new file mode 100644
index 0000000..dc27b42
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -0,0 +1,111 @@
+name: "\U0001F41B Bug Report"
+description: Submit a bug report to help us improve CASALIOY
+labels: ["02 Bug Report"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thank you for taking the time to file a bug report. Before creating a new
+        issue, please make sure to take a few moments to check the issue tracker
+        for existing issues about the bug.
+
+  - type: textarea
+    id: env
+    attributes:
+      label: .env
+      description: Please share your exact .env file. *format it with ``` as in the example below.*
+      placeholder: |
+        ```
+        # Generic
+        MODEL_N_CTX=1024
+        TEXT_EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2
+        TEXT_EMBEDDINGS_MODEL_TYPE=HF  # LlamaCpp or HF
+        USE_MLOCK=true
+
+        # Ingestion
+        PERSIST_DIRECTORY=db
+        DOCUMENTS_DIRECTORY=source_documents
+        INGEST_CHUNK_SIZE=500
+        INGEST_CHUNK_OVERLAP=50
+
+        # Generation
+        MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
+        MODEL_PATH=eachadea/ggml-vicuna-7b-1.1/ggml-vic7b-q5_1.bin
+        MODEL_TEMP=0.8
+        MODEL_STOP=[STOP]
+        CHAIN_TYPE=stuff
+        N_RETRIEVE_DOCUMENTS=100 # How many documents to retrieve from the db
+        N_FORWARD_DOCUMENTS=6 # How many documents to forward to the LLM, chosen among those retrieved
+        N_GPU_LAYERS=4
+        ```
+
+    validations:
+      required: true
+
+  - type: input
+    id: system-info-python
+    attributes:
+      label: Python version
+      placeholder: python 3.11.3
+    validations:
+      required: true
+  - type: input
+    id: system-info-system
+    attributes:
+      label: System
+      placeholder: Ubuntu-22.04
+    validations:
+      required: true
+  - type: input
+    id: system-info-casalioy
+    attributes:
+      label: CASALIOY version
+      placeholder: A release number (ex. `0.0.8`) or a commit id (ex. `13cce0e`)
+    validations:
+      required: true
+
+  - type: checkboxes
+    id: information-scripts-examples
+    attributes:
+      label: Information
+      description: "The problem arises when using:"
+      options:
+        - label: "The official example scripts"
+        - label: "My own modified scripts"
+
+  - type: checkboxes
+    id: related-components
+    attributes:
+      label: Related Components
+      description: "Select the components related to the issue (if applicable):"
+      options:
+        - label: "Document ingestion"
+        - label: "GUI"
+        - label: "Prompt answering"
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Please provide a [code sample](https://stackoverflow.com/help/minimal-reproducible-example) that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
+        If you have code snippets, error messages, stack traces please provide them here as well.
+        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
+        Avoid screenshots when possible, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
+
+      placeholder: |
+        Steps to reproduce the behavior:
+
+          1.
+          2.
+          3.
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "A clear and concise description of what you would expect to happen."
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000..e647772
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,2 @@
+blank_issues_enabled: true
+version: 2.1
diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml
new file mode 100644
index 0000000..736e77e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation.yml
@@ -0,0 +1,19 @@
+name: Documentation
+description: Report an issue related to the CASALIOY documentation.
+title: "DOC: <Please write a comprehensive title after the 'DOC: ' prefix>"
+labels: [03 - Documentation]
+
+body:
+- type: textarea
+  attributes:
+    label: "Issue with current documentation:"
+    description: >
+      Please make sure to leave a reference to the document/code you're
+      referring to.
+
+- type: textarea
+  attributes:
+    label: "Idea or request for content:"
+    description: >
+      Please describe as clearly as possible what topics you think are missing
+      from the current documentation.
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
new file mode 100644
index 0000000..eab3acb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -0,0 +1,30 @@
+name: "\U0001F680 Feature request"
+description: Submit a proposal/request for a new CASALIOY feature
+labels: ["02 Feature Request"]
+body:
+  - type: textarea
+    id: feature-request
+    validations:
+      required: true
+    attributes:
+      label: Feature request
+      description: |
+        A clear and concise description of the feature proposal. Please provide links to any relevant GitHub repos, papers, or other resources if relevant.
+
+  - type: textarea
+    id: motivation
+    validations:
+      required: true
+    attributes:
+      label: Motivation
+      description: |
+        Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: true
+    attributes:
+      label: Your contribution
+      description: |
+        Is there any way that you could help, e.g. by submitting a PR?
diff --git a/.github/ISSUE_TEMPLATE/other.yml b/.github/ISSUE_TEMPLATE/other.yml
new file mode 100644
index 0000000..88ad5a3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/other.yml
@@ -0,0 +1,18 @@
+name: Other Issue
+description: Raise an issue that wouldn't be covered by the other templates.
+title: "Issue: <Please write a comprehensive title after the 'Issue: ' prefix>"
+labels: [04 - Other]
+
+body:
+  - type: textarea
+    attributes:
+      label: "Issue you'd like to raise."
+      description: >
+        Please describe the issue you'd like to raise as clearly as possible.
+        Make sure to include any relevant links or references.
+
+  - type: textarea
+    attributes:
+      label: "Suggestion:"
+      description: >
+        Please outline a suggestion to improve the issue here.
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 58da439..0322ba0 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -15,6 +15,6 @@ jobs:
           DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
           DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
         run: |
-          docker build . --file Dockerfile --tag su77ungr/casalioy:stable
+          docker build . -t su77ungr/casalioy:stable
           docker login --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
           docker push su77ungr/casalioy:stable
diff --git a/Dockerfile b/Dockerfile
index 8219fa5..58caa7b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,14 +1,48 @@
-FROM python:3.11
+###############################################
+# Base Image
+###############################################
+FROM python:3.11-slim as python-base
+# We set POETRY_VERSION=1.3.2 because 1.4.x has some weird legacy issues
+# CASALIOY_FORCE_CPU = we install cpu-only pytorch.
+ENV PYTHONFAULTHANDLER=1 \
+      PYTHONUNBUFFERED=1 \
+      PYTHONHASHSEED=random \
+      PIP_NO_CACHE_DIR=off \
+      PIP_DISABLE_PIP_VERSION_CHECK=on \
+      PIP_DEFAULT_TIMEOUT=100 \
+      POETRY_NO_INTERACTION=1 \
+      POETRY_VIRTUALENVS_IN_PROJECT=true \
+      POETRY_VERSION=1.3.2 \
+      CASALIOY_FORCE_CPU=true
+RUN apt-get update && apt-get install -y build-essential git htop gdb nano unzip curl && rm -rf /var/lib/apt/lists/*
+#RUN if [ "$CASALIOY_ENABLE_LLAMA_GPU" = "true" ]; then \
+#        apt-get install -y nvidia-cuda-toolkit nvidia-cuda-toolkit-gcc; \
+#    fi; \
+RUN pip install --upgrade setuptools virtualenv
 
+###############################################
+# Builder Image
+###############################################
+FROM python-base as builder-base
+RUN pip install "poetry==$POETRY_VERSION"
 WORKDIR /srv
 RUN git clone https://github.com/su77ungr/CASALIOY.git
 WORKDIR CASALIOY
+RUN poetry install --with GUI,LLM --without dev --sync
+RUN . .venv/bin/activate && pip install --force streamlit
+RUN . .venv/bin/activate && \
+    if [ "$CASALIOY_FORCE_CPU" = "true" ]; then \
+        pip install --force torch torchvision --index-url https://download.pytorch.org/whl/cpu; \
+    else \
+        pip install --force sentence_transformers; \
+    fi
 
-RUN pip3 install poetry
-RUN python3 -m poetry config virtualenvs.create false
-RUN python3 -m poetry install
-RUN python3 -m pip install --force streamlit sentence_transformers # Temp fix, see pyproject.toml
-RUN python3 -m pip uninstall -y llama-cpp-python
-RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 -m pip install llama-cpp-python  # GPU support
-RUN pre-commit install
+###############################################
+# Production Image
+###############################################
+FROM python-base as production
+COPY --from=builder-base /srv /srv
+WORKDIR /srv/CASALIOY
 COPY example.env .env
+RUN echo "source /srv/CASALIOY/.venv/bin/activate" >> ~/.bashrc
+RUN . .venv/bin/activate && python -c "import nltk; nltk.download('averaged_perceptron_tagger')"
diff --git a/Dockerfile-GPU b/Dockerfile-GPU
new file mode 100644
index 0000000..bf98f96
--- /dev/null
+++ b/Dockerfile-GPU
@@ -0,0 +1,39 @@
+###############################################
+# Base Image
+###############################################
+FROM nvidia/cuda:12.1.1-base-ubuntu22.04 as base
+# We set POETRY_VERSION=1.3.2 because 1.4.x has some weird legacy issues
+ENV PYTHONFAULTHANDLER=1 \
+      PYTHONUNBUFFERED=1 \
+      PYTHONHASHSEED=random \
+      PIP_NO_CACHE_DIR=off \
+      PIP_DISABLE_PIP_VERSION_CHECK=on \
+      PIP_DEFAULT_TIMEOUT=100 \
+      POETRY_NO_INTERACTION=1 \
+      POETRY_VIRTUALENVS_IN_PROJECT=true \
+      POETRY_VERSION=1.3.2
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt install -y software-properties-common && add-apt-repository -y ppa:deadsnakes/ppa && apt-get install -y python3.11 python3.11-venv python3-pip build-essential git htop gdb nano unzip curl && rm -rf /var/lib/apt/lists/*
+RUN python3.11 -m pip install --upgrade setuptools virtualenv
+
+###############################################
+# Builder Image
+###############################################
+FROM base as builder-base
+RUN python3.11 -m pip install "poetry==$POETRY_VERSION"
+WORKDIR /srv
+RUN git clone https://github.com/su77ungr/CASALIOY.git
+WORKDIR CASALIOY
+RUN python3.11 -m poetry install --with GUI,LLM --without dev --sync
+RUN . .venv/bin/activate && pip install --force streamlit sentence_transformers
+RUN . .venv/bin/activate && pip uninstall -y llama-cpp-python && CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force llama-cpp-python
+
+###############################################
+# Production Image
+###############################################
+FROM base as production
+COPY --from=builder-base /srv /srv
+WORKDIR /srv/CASALIOY
+COPY example.env .env
+RUN echo "source /srv/CASALIOY/.venv/bin/activate" >> ~/.bashrc
+RUN . .venv/bin/activate && python -c "import nltk; nltk.download('averaged_perceptron_tagger'); nltk.download('punkt')"
diff --git a/README.md b/README.md
index 3f7d6e6..f57edfa 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,10 @@
 <!--suppress HtmlDeprecatedAttribute -->
 <div align="center">
 
-> **NOTICE** NOW WITH
-<a href="#chat-inside-gui-new-feature"><img src="https://img.shields.io/badge/GUI-blue.svg" alt="Roadmap 2023">
+<a href="https://www.buymeacoffee.com/cassowary" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-white.png" alt="Buy Me A Coffee" height="30" width="140"></a>
+
+**NOW** WITH
+<a href="https://github.com/su77ungr/CASALIOY/discussions/76"><img src="https://img.shields.io/badge/GUI-blue.svg" alt="Roadmap 2023">
 <br>
 <p align="center">
 
@@ -15,8 +17,8 @@
 <img height="300" src="https://github.com/su77ungr/GEEB-GPT/assets/69374354/2e59734c-0de7-4057-be7a-14729e1d5acd" alt="Qdrant"><br>
 
 <a href="https://github.com/su77ungr/CASALIOY/issues/8"><img src="https://img.shields.io/badge/Feature-Requests-bc1439.svg" alt="Roadmap 2023"> [![Docker Pulls](https://badgen.net/docker/pulls/su77ungr/casalioy?icon=docker&label=pulls)](https://hub.docker.com/r/su77ungr/casalioy/)</a>
+![example workflow](https://github.com/su77ungr/CASALIOY/actions/workflows/docker-image.yml/badge.svg)
 
- <a href="https://www.buymeacoffee.com/cassowary" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="30" width="140"></a>
 <br><br>
 </p>
 The fastest toolkit for air-gapped LLMs
@@ -29,26 +31,20 @@ The fastest toolkit for air-gapped LLMs
 
 # Setup
 
-### Docker guide
+### Docker (⛔️ only supports Ubuntu right now)
 
 ```bash
 docker pull su77ungr/casalioy:stable
 ```
 
 ```bash
-docker run -it su77ungr/casalioy:stable /bin/bash
+docker run -it --shm-size=16gb su77ungr/casalioy:stable /bin/bash
 ```
+
 for older docker without GUI use `casalioy:latest` might deprecate soon
 
 > Fetch the default models
 
-```
-cd models
-wget https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin &&
-wget https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin
-cd ../
-```
-
 > All set! Proceed with ingesting your [dataset](#ingesting-your-own-dataset)
 
 ### Build it from source
@@ -65,24 +61,20 @@ pre-commit install
 ```
 
 If you want GPU support for llama-ccp:
+
 ```shell
 pip uninstall -y llama-cpp-python
 CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --force llama-cpp-python
 ```
 
-> Download the 2 models and place them in a folder called `./models`:
-
-- LLM: default
-  is [ggml-vic7b-q5_1](https://huggingface.co/eachadea/ggml-vicuna-7b-1.1/resolve/main/ggml-vic7b-q5_1.bin)
-- Embedding: default
-  to [ggml-model-q4_0](https://huggingface.co/Pi3141/alpaca-native-7B-ggml/resolve/397e872bf4c83f4c642317a5bf65ce84a105786e/ggml-model-q4_0.bin).
-
 > > Edit the example.env to fit your models and rename it to .env
 
 ```env
 # Generic
 MODEL_N_CTX=1024
-LLAMA_EMBEDDINGS_MODEL=models/ggml-model-q4_0.bin
+TEXT_EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2
+TEXT_EMBEDDINGS_MODEL_TYPE=HF  # LlamaCpp or HF
+USE_MLOCK=true
 
 # Ingestion
 PERSIST_DIRECTORY=db
@@ -92,9 +84,12 @@ INGEST_CHUNK_OVERLAP=50
 
 # Generation
 MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
-MODEL_PATH=models/ggjt-v1-vic7b-uncensored-q4_0.bin
+MODEL_PATH=eachadea/ggml-vicuna-7b-1.1/ggml-vic7b-q5_1.bin
 MODEL_TEMP=0.8
-MODEL_STOP=###,\n
+MODEL_STOP=[STOP]
+CHAIN_TYPE=stuff
+N_RETRIEVE_DOCUMENTS=100 # How many documents to retrieve from the db
+N_FORWARD_DOCUMENTS=6 # How many documents to forward to the LLM, chosen among those retrieved
 ```
 
 This should look like this
@@ -102,32 +97,40 @@ This should look like this
 ```
 └── repo
       ├── startLLM.py
-      ├── ingest.py
+      ├── casalioy
+      │   └── ingest.py, load_env.py, startLLM.py, gui.py, ...
       ├── source_documents
       │   └── sample.csv
-      │   └── shor.pdfstate_of_the_union.txt
-      │   └── state_of_the_union.txt
+      │   └── ...
       ├── models
       │   ├── ggml-vic7b-q5_1.bin
-      │   └── ggml-model-q4_0.bin
+      │   └── ...
       └── .env, convert.py, Dockerfile
 ```
 
+
+> 👇 Update your installation!
+
+
+      git pull && poetry install
+
+
+
 ## Ingesting your own dataset
 
 To automatically ingest different data types (.txt, .pdf, .csv, .epub, .html, .docx, .pptx, .eml, .msg)
 
-> This repo includes dummy [files](https://github.com/imartinez/privateGPT/blob/main/source_documents/)
+> This repo includes dummy [files](https://github.com/su77ungr/CASALIOY/tree/main/source_documents)
 > inside `source_documents` to run tests with.
 
 ```shell
-python ingest.py # optional <path_to_your_data_directory>
+python casalioy/ingest.py # optional <path_to_your_data_directory>
 ```
 
 Optional: use `y` flag to purge existing vectorstore and initialize fresh instance
 
 ```shell
-python ingest.py # optional <path_to_your_data_directory> y
+python casalioy/ingest.py # optional <path_to_your_data_directory> y
 ```
 
 This spins up a local qdrant namespace inside the `db` folder containing the local vectorstore. Will take time,
@@ -140,7 +143,7 @@ database. To remove dataset simply remove `db` folder.
 In order to ask a question, run a command like:
 
 ```shell
-python startLLM.py
+python casalioy/startLLM.py
 ```
 
 And wait for the script to require your input.
@@ -163,7 +166,7 @@ Type `exit` to finish the script.
 Introduced by [@alxspiker](https://github.com/alxspiker) -> see [#21](https://github.com/su77ungr/CASALIOY/pull/21)
 
 ```shell
-streamlit run .\gui.py
+streamlit run casalioy/gui.py
 ```
 
 # LLM options
@@ -172,7 +175,6 @@ streamlit run .\gui.py
 
 | Model                                                                                                                                            | BoolQ | PIQA | HellaSwag | WinoGrande | ARC-e | ARC-c | OBQA | Avg. |
 |:-------------------------------------------------------------------------------------------------------------------------------------------------|:-----:|:----:|:---------:|:----------:|:-----:|:-----:|:----:|:----:|
-| [ggml-vic-7b-uncensored](https://huggingface.co/datasets/dnato/ggjt-v1-vic7b-uncensored-q4_0.bin/resolve/main/ggjt-v1-vic7b-uncensored-q4_0.bin) | 73.4  | 74.8 |   63.4    |    64.7    | 54.9  | 36.0  | 40.2 | 58.2 |
 | [GPT4All-13b-snoozy q5](https://huggingface.co/TheBloke/GPT4All-13B-snoozy-GGML/blob/main/GPT4All-13B-snoozy.ggml.q5_1.bin)                      | 83.3  | 79.2 |   75.0    |    71.3    | 60.9  | 44.2  | 43.4 | 65.3 |
 
 ### models inside of the GPT-J ecosphere
@@ -215,6 +217,7 @@ leaving your environment, and with reasonable performance.
 
 <br><br>
 
+
 # Disclaimer
 
 The contents of this repository are provided "as is" and without warranties of any kind, whether express or implied. We
diff --git a/casalioy/CustomChains.py b/casalioy/CustomChains.py
new file mode 100644
index 0000000..f2b6384
--- /dev/null
+++ b/casalioy/CustomChains.py
@@ -0,0 +1,160 @@
+"""Custom chains for LLM"""
+
+from langchain import PromptTemplate
+from langchain.base_language import BaseLanguageModel
+from langchain.chains.qa_generation.prompt import PROMPT_SELECTOR
+from langchain.schema import Document
+from langchain.vectorstores.base import VectorStoreRetriever
+
+from casalioy.load_env import (
+    model_n_ctx,
+    n_forward_documents,
+    n_retrieve_documents,
+)
+from casalioy.utils import print_HTML
+
+
+class BaseQA:
+    """base class for Question-Answering"""
+
+    def __init__(self, llm: BaseLanguageModel, retriever: VectorStoreRetriever, prompt: PromptTemplate = None):
+        self.llm = llm
+        self.retriever = retriever
+        self.prompt = prompt or self.default_prompt
+        self.retriever.search_kwargs = {**self.retriever.search_kwargs, "k": n_forward_documents, "fetch_k": n_retrieve_documents}
+
+    @property
+    def default_prompt(self) -> PromptTemplate:
+        """the default prompt"""
+        return PROMPT_SELECTOR.get_prompt(self.llm)
+
+    def fetch_documents(self, search: str) -> list[Document]:
+        """fetch documents from retriever"""
+        return self.retriever.get_relevant_documents(search)
+
+    def __call__(self, input_str: str) -> dict:
+        """ask a question, return results"""
+        return {"result": self.llm.predict(self.default_prompt.format_prompt(question=input_str).to_string())}
+
+
+class StuffQA(BaseQA):
+    """custom QA close to a stuff chain
+    compared to the default stuff chain which may exceed the context size, this chain loads as many documents as allowed by the context size.
+    Since it uses all the context size, it's meant for a "one-shot" question, not leaving space for a follow-up question which exactly contains the previous one.
+    """
+
+    @property
+    def default_prompt(self) -> PromptTemplate:
+        """the default prompt"""
+        prompt = """HUMAN:
+Answer the question using ONLY the given extracts from (possibly unrelated and irrelevant) documents, not your own knowledge.
+If you are unsure of the answer or if it isn't provided in the extracts, answer "Unknown[STOP]".
+Conclude your answer with "[STOP]" when you're finished.
+
+Question: {question}
+
+--------------
+Here are the extracts:
+{context}
+
+--------------
+Remark: do not repeat the question !
+
+ASSISTANT:
+"""
+        return PromptTemplate(template=prompt, input_variables=["context", "question"])
+
+    @staticmethod
+    def context_prompt_str(documents: list[Document]) -> str:
+        """the document's prompt"""
+        prompt = "".join(f"Extract {i + 1}: {document.page_content}\n\n" for i, document in enumerate(documents))
+        return prompt.strip()
+
+    def __call__(self, input_str: str) -> dict:
+        all_documents, documents = self.fetch_documents(input_str), []
+        for document in all_documents:
+            documents.append(document)
+            context_str = self.context_prompt_str(documents)
+            if (
+                self.llm.get_num_tokens(self.prompt.format_prompt(question=input_str, context=context_str).to_string())
+                > model_n_ctx - self.llm.dict()["max_tokens"]
+            ):
+                documents.pop()
+                break
+        print_HTML("<r>Stuffed {n} documents in the context</r>", n=len(documents))
+        context_str = self.context_prompt_str(documents)
+        formatted_prompt = self.prompt.format_prompt(question=input_str, context=context_str).to_string()
+        return {"result": self.llm.predict(formatted_prompt), "source_documents": documents}
+
+
+class RefineQA(BaseQA):
+    """custom QA close to a refine chain"""
+
+    @property
+    def default_prompt(self) -> PromptTemplate:
+        """the default prompt"""
+        prompt = f"""HUMAN:
+Answer the question using ONLY the given extracts from a (possibly irrelevant) document, not your own knowledge.
+If you are unsure of the answer or if it isn't provided in the extract, answer "Unknown[STOP]".
+Conclude your answer with "[STOP]" when you're finished.
+Avoid adding any extraneous information.
+
+Question:
+-----------------
+{{question}}
+
+Extract:
+-----------------
+{{context}}
+
+ASSISTANT:
+"""
+        return PromptTemplate(template=prompt, input_variables=["context", "question"])
+
+    @property
+    def refine_prompt(self) -> PromptTemplate:
+        """prompt to use for the refining step"""
+        prompt = f"""HUMAN:
+Refine the original answer to the question using the new (possibly irrelevant) document extract.
+Use ONLY the information from the extract and the previous answer, not your own knowledge.
+The extract may not be relevant at all to the question.
+Conclude your answer with "[STOP]" when you're finished.
+Avoid adding any extraneous information.
+
+Question:
+-----------------
+{{question}}
+
+Original answer:
+-----------------
+{{previous_answer}}
+
+New extract:
+-----------------
+{{context}}
+
+Reminder:
+-----------------
+If the extract is not relevant or helpful, don't even talk about it. Simply copy the original answer, without adding anything.
+Do not copy the question.
+
+ASSISTANT:
+"""
+        return PromptTemplate(template=prompt, input_variables=["context", "question", "previous_answer"])
+
+    def __call__(self, input_str: str) -> dict:
+        """ask a question"""
+        documents = self.fetch_documents(input_str)
+        last_answer, score = None, None
+        for i, doc in enumerate(documents):
+            print_HTML("<r>Refining from document {i}/{N}</r>", i=i + 1, N=len(documents))
+            prompt = self.default_prompt if i == 0 else self.refine_prompt
+            if i == 0:
+                formatted_prompt = prompt.format_prompt(question=input_str, context=doc.page_content)
+            else:
+                formatted_prompt = prompt.format_prompt(question=input_str, context=doc.page_content, previous_answer=last_answer)
+            last_answer = self.llm.predict(formatted_prompt.to_string())
+        return {
+            "result": f"{last_answer}",
+            "source_documents": documents,
+        }
diff --git a/casalioy/__init__.py b/casalioy/__init__.py
new file mode 100644
index 0000000..00d293a
--- /dev/null
+++ b/casalioy/__init__.py
@@ -0,0 +1 @@
+"""init"""
diff --git a/casalioy/ask_libgen.py b/casalioy/ask_libgen.py
new file mode 100644
index 0000000..3d634aa
--- /dev/null
+++ b/casalioy/ask_libgen.py
@@ -0,0 +1,76 @@
+"""answer questions using documents from LibGen"""
+import asyncio
+import logging
+import os
+import shutil
+from pathlib import Path
+
+from libgenesis import Libgen
+from prompt_toolkit import PromptSession
+from prompt_toolkit.shortcuts import ProgressBar
+
+from casalioy.ingest import Ingester
+from casalioy.load_env import (
+    chunk_overlap,
+    chunk_size,
+    get_embedding_model,
+    model_n_ctx,
+    model_path,
+    model_stop,
+    model_temp,
+    n_gpu_layers,
+    persist_directory,
+    use_mlock,
+)
+from casalioy.startLLM import QASystem
+from casalioy.utils import print_HTML, prompt_HTML
+
+max_doc_size_mb = 5
+out_path = Path("source_documents/libgen")
+
+logging.getLogger().setLevel(logging.WARNING)  # because libgenesis changes it
+
+if out_path.exists():
+    shutil.rmtree(out_path)
+os.mkdir(out_path)
+
+
+def load_documents(keyword: str, n: int = 3) -> None:
+    """load random documents from LG using keyword"""
+    lg = Libgen(result_limit=100)
+    result = asyncio.run(lg.search(keyword))
+    dl_N = 0
+    print_HTML(f"<r>Searching for interesting documents (max {n})</r>")
+    with ProgressBar() as pb:
+        for item_id in pb(result):
+            if dl_N >= n:
+                break
+            item = result[item_id]
+            if int(item["filesize"]) > 1024**2 * max_doc_size_mb:
+                continue
+            if item["extension"] not in ["pdf", "epub"]:
+                print_HTML("<r>skipped ext. {ext}</r>", ext=item["extension"])
+                continue
+            asyncio.run(lg.download(item["mirrors"]["main"], dest_folder=out_path))
+            dl_N += 1
+        if dl_N == 0:
+            raise ValueError(f"No good result for {keyword}")
+    print_HTML(f"<r>Got {dl_N} files</r>")
+
+
+def search(question: str, keyword: str) -> None:
+    """ask a question"""
+    load_documents(keyword)
+
+    Ingester(persist_directory, collection="libgen").ingest_from_directory(str(out_path), chunk_size, chunk_overlap)
+
+    qa = QASystem(get_embedding_model()[0], persist_directory, model_path, model_n_ctx, model_temp, model_stop, use_mlock, n_gpu_layers, collection="libgen")
+    qa.prompt_once(question)
+
+
+if __name__ == "__main__":
+    # Interactive entry point: prompt for a question and a search keyword,
+    # then hand both to search(), which downloads, ingests and answers.
+    session = PromptSession()
+    question = prompt_HTML(session, "<b>Enter your question</b>: ")
+    keyword = prompt_HTML(session, "<b>Enter a keyword to search for relevant sources</b>: ")
+
+    search(question, keyword)
diff --git a/casalioy/gui.py b/casalioy/gui.py
new file mode 100644
index 0000000..57014ba
--- /dev/null
+++ b/casalioy/gui.py
@@ -0,0 +1,145 @@
+"""LLM through a GUI"""
+
+import streamlit as st
+from load_env import get_embedding_model, model_n_ctx, model_path, model_stop, model_temp, n_gpu_layers, persist_directory, use_mlock
+from streamlit_chat import message
+from streamlit_extras.add_vertical_space import add_vertical_space
+from streamlit_extras.colored_header import colored_header
+
+from casalioy import startLLM
+from casalioy.startLLM import QASystem
+from casalioy.utils import print_HTML
+
+title = "CASALIOY"
+
+
+@st.cache_resource
+def load_model(_params) -> QASystem:
+    """ensures the model is loaded"""
+    print_HTML("<r>Initializing...</r>")
+    return startLLM.QASystem(
+        get_embedding_model()[0],
+        persist_directory,
+        model_path,
+        _params["model_n_ctx"],
+        _params["model_temp"],
+        _params["model_stop"],
+        use_mlock,
+        n_gpu_layers,
+    )
+
+
+class UI:
+    r"""UI manager /!\ only one instance at a time"""
+
+    def init_state(self) -> None:
+        """initializes the state"""
+        if self.key_init not in st.session_state:
+            st.session_state.input = ""
+            st.session_state.running = False
+            st.session_state[self.key_init] = False
+            st.session_state.model_temp = model_temp
+            st.session_state.model_n_ctx = model_n_ctx
+            st.session_state.model_stop = ",".join(model_stop)
+            st.set_page_config(page_title=title)
+
+        if self.key_generated not in st.session_state:
+            st.session_state[self.key_generated] = ["I can help you answer questions about the documents you have ingested into the vector store."]
+
+        if self.key_past not in st.session_state:
+            st.session_state[self.key_past] = ["Hi, what can you help me with!"]
+
+    def build_interface(self) -> None:
+        """build the interface"""
+        with st.sidebar:  # Sidebar contents
+            st.title(title)
+            st.markdown(
+                """
+## About
+This app is an LLM-powered chatbot built using:
+- [Streamlit](https://streamlit.io/)
+- [su77ungr/CASALIOY](https://github.com/su77ungr/CASALIOY) LLM Toolkit
+
+💡 Note: No API key required!
+Refreshing the page will restart gui.py with a fresh chat history.
+CASALIOY will not remember previous questions as of yet.
+
+GUI does not support live response yet, so you have to wait for the tokens to process.
+    """
+            )
+            add_vertical_space(5)
+            st.write("Made with ❤️ by [su77ungr/CASALIOY](https://github.com/su77ungr/CASALIOY)")
+
+        # noinspection PyTypeChecker
+        colored_header(label="", description="", color_name="blue-30")
+
+        self.response_container = st.container()
+        with self.response_container:
+            st.write(
+                "WARNING: you need to modify those parameters BEFORE asking your first question. Modifying them later on does nothing. To change them, RELAUNCH the gui (not reload te page), edit their value, and ask your question."
+            )
+
+            # Parameter pickers
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.number_input("Temperature", key="model_temp", step=0.05, min_value=0.0, max_value=1.0)
+            with col2:
+                st.number_input("Context", key="model_n_ctx", step=512, min_value=512, max_value=9000)
+            with col3:
+                st.text_input("Stops", key="model_stop")
+
+            # Restore message history
+            if self.key_generated in st.session_state:
+                for i in range(len(st.session_state[self.key_generated])):
+                    message(st.session_state[self.key_past][i], is_user=True, key=f"{str(i)}_user")
+                    message(st.session_state[self.key_generated][i], key=str(i))
+
+            form = st.form(key="input-form", clear_on_submit=True)
+            with form:
+                st.form_submit_button(
+                    "SUBMIT",
+                    on_click=self.generate_response,
+                    disabled=st.session_state.running,
+                )
+                st.text_input("You: ", "", key="input", disabled=st.session_state.running)
+        st.session_state[self.key_init] = True
+
+    def __init__(self):
+        self.key_init = "initialized"
+        self.key_generated = "generated"
+        self.key_past = "past"
+        self.qa_system = None
+        self.response_container = None
+        self.init_state()
+        self.build_interface()
+
+    def generate_response(self) -> None:
+        """handle a message from the user"""
+        input_str = st.session_state.input
+        if not input_str.strip():
+            return
+
+        print_HTML(f"<r>Input:{input_str}</r>")
+
+        with self.response_container:
+            st.session_state.running = True
+            st.session_state[self.key_past].append(input_str)
+
+            message(input_str, is_user=True)
+            message(
+                "Loading response. Please wait for me to finish before refreshing the page...",
+                key="rmessage",
+            )
+
+            params = {
+                "model_n_ctx": st.session_state.model_n_ctx,
+                "model_temp": st.session_state.model_temp,
+                "model_stop": st.session_state.model_stop.split(","),
+            }
+            answer, sources = load_model(params).prompt_once(st.session_state.input)
+            st.session_state.input = ""
+            st.session_state[self.key_generated].append(answer)
+            st.session_state.running = False
+
+
+UI()
diff --git a/casalioy/ingest.py b/casalioy/ingest.py
new file mode 100644
index 0000000..5efc175
--- /dev/null
+++ b/casalioy/ingest.py
@@ -0,0 +1,168 @@
+"""ingest documents into vector database using embedding"""
+
+import contextlib
+import multiprocessing
+import os
+import shutil
+import sys
+from hashlib import md5
+from pathlib import Path
+from typing import Any, Callable
+
+from langchain.docstore.document import Document
+from langchain.document_loaders import (
+    CSVLoader,
+    OutlookMessageLoader,
+    PDFMinerLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, ingest_n_threads, persist_directory
+from prompt_toolkit import PromptSession
+from prompt_toolkit.shortcuts import ProgressBar
+from qdrant_client import QdrantClient, models
+
+from casalioy.utils import print_HTML, prompt_HTML
+
+# Force the "spawn" start method for multiprocessing; a RuntimeError raised by the call is ignored.
+with contextlib.suppress(RuntimeError):
+    multiprocessing.set_start_method("spawn", force=True)
+
+
+class Ingester:
+    """Load documents, split them into chunks, embed the chunks and store them in a Qdrant collection."""
+
+    file_loaders = {  # extension -> loader
+        "txt": lambda path: TextLoader(path, encoding="utf8"),
+        "pdf": PDFMinerLoader,
+        "csv": CSVLoader,
+        "epub": UnstructuredEPubLoader,
+        "html": UnstructuredHTMLLoader,
+        "docx": UnstructuredWordDocumentLoader,
+        "doc": UnstructuredWordDocumentLoader,
+        "pptx": UnstructuredPowerPointLoader,
+        "ppt": UnstructuredPowerPointLoader,
+        "eml": UnstructuredEmailLoader,
+        "msg": OutlookMessageLoader,
+    }
+
+    def __init__(self, db_dir: str, collection: str = "test", verbose=False):
+        """
+        :param db_dir: path passed to QdrantClient as its storage location
+        :param collection: name of the collection the chunks are written to
+        :param verbose: when true, print per-file and per-batch progress messages
+        """
+        self.n_threads = ingest_n_threads
+        self.encode_fun = None  # set by ingest_from_directory
+        self.text_splitter = None  # set by ingest_from_directory
+        self.db_dir = db_dir
+        self.collection = collection
+        self.verbose = verbose
+        self.awaiting_storage = []  # (embedding, document) pairs not yet written to the store
+        self.store_N_batch = 1000  # buffered pairs needed before a non-forced write happens
+
+    def load_one_doc(self, filepath: Path) -> list[Document]:
+        """Load one file with the loader registered for its extension.
+
+        :returns: the loaded documents, or an empty list when the extension has no loader
+        """
+        if self.verbose:
+            print_HTML("<r>Processing {fname}</r>", fname=filepath.name)
+        extension = filepath.suffix[1:]  # drop the leading dot
+        if extension not in self.file_loaders:
+            if self.verbose:
+                print_HTML("<w>Unhandled file format: {fname} in {fparent}</w>", fname=filepath.name, fparent=filepath.parent)
+            return []
+
+        return self.file_loaders[extension](str(filepath)).load()
+
+    def embed_documents_with_progress(self, embedding_function: Callable, documents: list[Document]) -> list[tuple[Any, Document]]:
+        """Embed the page content of every document.
+
+        :param embedding_function: callable taking a list of strings and returning one embedding per string
+        :returns: list of (embedding, document) pairs, in input order
+        """
+        if self.verbose:
+            print_HTML(f"<r>Processing {len(documents)} chunks</r>")
+
+        embeddings = embedding_function([doc.page_content for doc in documents])
+        # Array-like results are converted to plain lists; an encoder that already
+        # returns a list (the LlamaCpp encoding function does) has no .tolist().
+        if hasattr(embeddings, "tolist"):
+            embeddings = embeddings.tolist()
+        return list(zip(embeddings, documents))
+
+    def store_embeddings(self, embeddings_and_docs: list[tuple[Any, Document]], force: bool = False) -> None:
+        """Buffer the given pairs and write the buffer to the vector store once it is large enough.
+
+        :param embeddings_and_docs: (embedding, document) pairs to add to the buffer; may be empty
+        :param force: write the buffer even if it holds fewer than ``store_N_batch`` pairs
+        """
+        self.awaiting_storage += embeddings_and_docs
+        if not force and len(self.awaiting_storage) < self.store_N_batch:
+            return
+        if not self.awaiting_storage:
+            # Nothing buffered (e.g. a final flush after no file produced chunks): nothing to write.
+            return
+        client = QdrantClient(path=self.db_dir, prefer_grpc=True)
+        try:
+            client.get_collection(self.collection)
+        except ValueError:  # doesn't exist
+            # Take the vector size from the embeddings that are about to be stored.
+            vector_size = max(len(e[0]) for e in self.awaiting_storage)
+            print_HTML(f"<r>Creating a new collection, vector size={vector_size}</r>")
+            client.recreate_collection(
+                collection_name=self.collection,
+                vectors_config=models.VectorParams(
+                    size=vector_size,
+                    distance=models.Distance["COSINE"],
+                ),
+            )
+
+        print_HTML(f"<r>Saving {len(self.awaiting_storage)} chunks</r>")
+        embeddings, texts, metadatas = (
+            [e[0] for e in self.awaiting_storage],
+            [e[1].page_content for e in self.awaiting_storage],
+            [e[1].metadata for e in self.awaiting_storage],
+        )
+        client.upsert(
+            collection_name=self.collection,
+            points=models.Batch.construct(
+                # The point id is the md5 of the chunk text, so identical text maps to the same id.
+                ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
+                vectors=embeddings,
+                payloads=[{"page_content": text, "metadata": metadatas[i]} for i, text in enumerate(texts)],
+            ),
+        )
+        collection = client.get_collection(self.collection)
+        self.awaiting_storage = []
+        if self.verbose:
+            print_HTML(f"<r>Saved, the collection now holds {collection.points_count} documents.</r>")
+
+    def process_one_doc(self, filepath: Path) -> list[tuple[Any, Document]] | None:
+        """Load, split and embed one file.
+
+        :returns: (embedding, chunk) pairs, or None when nothing was loaded from the file
+        """
+        document = self.load_one_doc(filepath)
+        if not document:
+            return None
+        split_document = self.text_splitter.split_documents(document)
+        res = self.embed_documents_with_progress(self.encode_fun, split_document)
+        if self.verbose:
+            print_HTML("<r>Processed {fname}</r>", fname=filepath.name)
+        return res
+
+    def ingest_from_directory(self, path: str, chunk_size: int, chunk_overlap: int) -> None:
+        """Ingest all supported files found under ``path`` (walked recursively).
+
+        :param path: directory to scan
+        :param chunk_size: chunk size given to the text splitter
+        :param chunk_overlap: chunk overlap given to the text splitter
+        """
+        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        self.encode_fun = get_embedding_model()[1]
+
+        # get all documents
+        print_HTML("<r>Scanning files</r>")
+        all_items = [Path(root) / file for root, dirs, files in os.walk(path) for file in files]
+        with ProgressBar() as pb:
+            with multiprocessing.Pool(self.n_threads) as pool:
+                for embeddings in pb(pool.imap_unordered(self.process_one_doc, all_items), total=len(all_items)):
+                    if embeddings is None:
+                        continue
+                    self.store_embeddings(embeddings)
+            # Flush what is still buffered. Nothing new is passed in: every result of the
+            # loop above has already been added to the buffer.
+            self.store_embeddings([], force=True)
+        print_HTML("<r>Done</r>")
+
+
+def main(sources_directory: str, cleandb: str) -> None:
+    """Ingest ``sources_directory`` into the persisted database.
+
+    :param sources_directory: directory whose files are ingested
+    :param cleandb: "y" deletes an existing database without asking; "n" asks the user
+        whether to delete it. The value is compared case-insensitively.
+    """
+    ingester = Ingester(persist_directory)
+    session = PromptSession()
+    # Normalise once so that "Y"/"N" behave exactly like "y"/"n".
+    choice = cleandb.lower()
+
+    if os.path.exists(ingester.db_dir):
+        if choice == "y" or (choice == "n" and prompt_HTML(session, "\n<b><w>Delete current database?(Y/N)</w></b>: ").lower() == "y"):
+            print_HTML("<r>Deleting db...</r>")
+            shutil.rmtree(ingester.db_dir)
+        elif choice == "n":
+            print_HTML("<r>Adding to db...</r>")
+
+    ingester.ingest_from_directory(sources_directory, chunk_size, chunk_overlap)
+
+
+# Script entry point.
+if __name__ == "__main__":
+    # First CLI argument: directory to ingest; falls back to documents_directory.
+    sources_directory = sys.argv[1] if len(sys.argv) > 1 else documents_directory
+    # Second CLI argument: clean-db flag; falls back to "n".
+    cleandb = sys.argv[2] if len(sys.argv) > 2 else "n"
+    main(sources_directory, cleandb)
diff --git a/casalioy/load_env.py b/casalioy/load_env.py
new file mode 100644
index 0000000..65191e2
--- /dev/null
+++ b/casalioy/load_env.py
@@ -0,0 +1,94 @@
+"""load env variables"""
+import os
+from typing import Callable
+
+from dotenv import load_dotenv
+from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings
+from langchain.prompts import PromptTemplate
+
+from casalioy.utils import download_if_repo
+
+# Load variables from a .env file, then read the settings from the process environment.
+load_dotenv()
+text_embeddings_model = os.environ.get("TEXT_EMBEDDINGS_MODEL")
+text_embeddings_model_type = os.environ.get("TEXT_EMBEDDINGS_MODEL_TYPE")
+# USE_MLOCK is an optional flag: default to "false" so that an unset variable does not
+# end up calling .lower() on None.
+use_mlock = os.environ.get("USE_MLOCK", "false").lower() == "true"
+
+# ingest
+persist_directory = os.environ.get("PERSIST_DIRECTORY")
+documents_directory = os.environ.get("DOCUMENTS_DIRECTORY")
+# No default is given for the two chunk settings: int() raises if either is unset.
+chunk_size = int(os.environ.get("INGEST_CHUNK_SIZE"))
+chunk_overlap = int(os.environ.get("INGEST_CHUNK_OVERLAP"))
+ingest_n_threads = int(os.environ.get("INGEST_N_THREADS", 1))
+
+# generate
+model_type = os.environ.get("MODEL_TYPE")
+model_path = os.environ.get("MODEL_PATH")
+# No default is given for the next two settings either: int() raises if one is unset.
+model_n_ctx = int(os.environ.get("MODEL_N_CTX"))
+model_max_tokens = int(os.environ.get("MODEL_MAX_TOKENS"))
+model_temp = float(os.environ.get("MODEL_TEMP", "0.8"))
+# MODEL_STOP is a comma-separated list; an empty or unset value gives an empty list.
+model_stop = os.environ.get("MODEL_STOP", "")
+model_stop = model_stop.split(",") if model_stop else []
+chain_type = os.environ.get("CHAIN_TYPE", "refine")
+n_retrieve_documents = int(os.environ.get("N_RETRIEVE_DOCUMENTS", 25))
+n_forward_documents = int(os.environ.get("N_FORWARD_DOCUMENTS", 3))
+n_gpu_layers = int(os.environ.get("N_GPU_LAYERS", 0))
+
+# Both model settings are replaced by whatever download_if_repo returns for them.
+text_embeddings_model = download_if_repo(text_embeddings_model)
+model_path = download_if_repo(model_path)
+
+
+def get_embedding_model() -> tuple[HuggingFaceEmbeddings | LlamaCppEmbeddings, Callable]:
+    """Build the text embedding model selected by ``text_embeddings_model_type``.
+
+    :returns: tuple[the model, its encoding function]
+    :raises ValueError: when the configured type is neither "HF" nor "LlamaCpp"
+    """
+    if text_embeddings_model_type == "HF":
+        hf_model = HuggingFaceEmbeddings(model_name=text_embeddings_model)
+        return hf_model, hf_model.client.encode
+
+    if text_embeddings_model_type == "LlamaCpp":
+        llama_model = LlamaCppEmbeddings(model_path=text_embeddings_model, n_ctx=model_n_ctx, n_gpu_layers=n_gpu_layers)
+
+        def encode(inpt):
+            # no batched embedding in llamacpp: a single string is embedded directly,
+            # anything else is embedded element by element
+            if isinstance(inpt, str):
+                return llama_model.client.embed(inpt)
+            return [llama_model.client.embed(e) for e in inpt]
+
+        return llama_model, encode
+
+    raise ValueError(f"Unknown embedding type {text_embeddings_model_type}")
+
+
+def get_prompt_template_kwargs() -> dict[str, PromptTemplate]:
+    """Return the prompt keyword arguments that go with the configured ``chain_type``.
+
+    "stuff" gives a single ``prompt`` entry, "refine" gives ``question_prompt`` and
+    ``refine_prompt``, and any other chain type gives an empty dict.
+    """
+    if chain_type == "stuff":
+        stuff_template = """HUMAN: Answer the question using ONLY the given context. If you are unsure of the answer, respond with "Unknown[STOP]". Conclude your response with "[STOP]" to indicate the completion of the answer.
+
+Context: {context}
+
+Question: {question}
+
+ASSISTANT:"""
+        return {"prompt": PromptTemplate(template=stuff_template, input_variables=["context", "question"])}
+
+    if chain_type == "refine":
+        initial_template = """HUMAN: Answer the question using ONLY the given context.
+Indicate the end of your answer with "[STOP]" and refrain from adding any additional information beyond that which is provided in the context.
+
+Question: {question}
+
+Context: {context_str}
+
+ASSISTANT:"""
+        followup_template = """HUMAN: Refine the original answer to the question using the new context.
+Use ONLY the information from the context and your previous answer.
+If the context is not helpful, use the original answer.
+Indicate the end of your answer with "[STOP]" and avoid adding any extraneous information.
+
+Original question: {question}
+
+Existing answer: {existing_answer}
+
+New context: {context_str}
+
+ASSISTANT:"""
+        initial_prompt = PromptTemplate(template=initial_template, input_variables=["context_str", "question"])
+        followup_prompt = PromptTemplate(template=followup_template, input_variables=["context_str", "existing_answer", "question"])
+        return {"question_prompt": initial_prompt, "refine_prompt": followup_prompt}
+
+    return {}
diff --git a/casalioy/misc/convert.py b/casalioy/misc/convert.py
new file mode 100644
index 0000000..53c02ed
--- /dev/null
+++ b/casalioy/misc/convert.py
@@ -0,0 +1,1177 @@
+import concurrent.futures
+import copy
+import enum
+import faulthandler
+import functools
+import io
+import itertools
+import json
+import math
+import mmap
+import os
+import pickle
+import re
+import signal
+import struct
+import sys
+import zipfile
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union
+
+import numpy as np
+from dotenv import load_dotenv
+from sentencepiece import SentencePieceProcessor  # type: ignore
+
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+# Register SIGUSR1 with faulthandler, but only where both faulthandler.register and
+# signal.SIGUSR1 exist.
+if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
+    faulthandler.register(signal.SIGUSR1)
+
+# Array type used in the annotations of this file; written as a string.
+NDArray: "TypeAlias" = "np.ndarray[Any, Any]"
+
+
+@dataclass(frozen=True)
+class UnquantizedDataType:
+    """Immutable marker for an unquantized element type, identified by its name."""
+
+    # Name of the type, e.g. "F16".
+    name: str
+
+
+# The unquantized data types used in this file.
+DT_F16 = UnquantizedDataType("F16")
+DT_F32 = UnquantizedDataType("F32")
+DT_I32 = UnquantizedDataType("I32")
+DT_BF16 = UnquantizedDataType("BF16")
+
+
+@dataclass(frozen=True)
+class QuantizedDataType:
+    """Immutable description of a quantized data type."""
+
+    # Number of columns that share one scale (and one addend, when present).
+    groupsize: int
+    # Whether an addend is stored next to each scale.
+    have_addends: bool
+    # Whether a g_idx array selects the scale/addend group of each column.
+    have_g_idx: bool
+
+
+# The two quantized types: both use groups of 32 without g_idx; only Q4_1 has addends.
+DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
+DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
+
+DataType = Union[UnquantizedDataType, QuantizedDataType]
+
+# Data type -> integer ftype code.
+DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
+    DT_F32: 0,
+    DT_F16: 1,
+    DT_Q4_0: 2,
+    DT_Q4_1: 3,
+}
+
+# Inverse of DATA_TYPE_TO_FTYPE.
+FTYPE_TO_DATA_TYPE: Dict[int, DataType] = {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
+
+# Unquantized data type -> numpy dtype. BF16 maps to uint16 (numpy is given the raw
+# 16-bit patterns; see bf16_to_fp32 for the conversion to float32).
+DATA_TYPE_TO_NUMPY: Dict[DataType, "np.dtype[Any]"] = {
+    DT_BF16: np.dtype(np.uint16),
+    DT_F16: np.dtype(np.float16),
+    DT_F32: np.dtype(np.float32),
+    DT_I32: np.dtype(np.int32),
+}
+
+# Inverse of DATA_TYPE_TO_NUMPY.
+NUMPY_TYPE_TO_DATA_TYPE: Dict["np.dtype[Any]", DataType] = {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+
+
+class GGMLFileType(enum.Enum):
+    """Output file flavours; each decides the data type used for every tensor."""
+
+    AllF32 = 0
+    MostlyF16 = 1  # except 1d tensors
+    MostlyQ4_0 = 2  # except 1d tensors
+    MostlyQ4_1 = 3  # except 1d tensors
+    PerLayerIsQ4_1 = 4  # but tok_embeddings.weight and output.weight are F16
+
+    def type_for_tensor(self, name: str, tensor: "LazyTensor") -> DataType:
+        """Return the data type the tensor called ``name`` gets under this file type.
+
+        :raises ValueError: when ``self`` is not one of the handled members
+        """
+        # 1D tensors are always F32.
+        if len(tensor.shape) == 1:
+            return DT_F32
+
+        # The only member whose answer depends on the tensor name.
+        if self == GGMLFileType.PerLayerIsQ4_1:
+            return DT_F16 if name in ("output.weight", "tok_embeddings.weight") else DT_Q4_1
+
+        # Members that use one type for every tensor with more than one dimension.
+        uniform_types = {
+            GGMLFileType.AllF32: DT_F32,
+            GGMLFileType.MostlyF16: DT_F16,
+            GGMLFileType.MostlyQ4_0: DT_Q4_0,
+            GGMLFileType.MostlyQ4_1: DT_Q4_1,
+        }
+        if self in uniform_types:
+            return uniform_types[self]
+        raise ValueError(self)
+
+
+def make_tensors_list(n_layers: int = 80) -> List[str]:
+    """Return the known tensor names: the three global ones, then the per-layer ones.
+
+    :param n_layers: number of layers to generate names for (default 80, the maximum
+        number of layers)
+    :returns: names in a fixed order, without duplicates
+    """
+    ret = [
+        "tok_embeddings.weight",
+        "norm.weight",
+        "output.weight",
+    ]
+    for i in range(n_layers):
+        # A misspelled "atttention_norm" entry used to be listed here as well; it only
+        # duplicated attention_norm under a wrong name and has been removed.
+        ret += [
+            f"layers.{i}.attention.wq.weight",
+            f"layers.{i}.attention.wk.weight",
+            f"layers.{i}.attention.wv.weight",
+            f"layers.{i}.attention.wo.weight",
+            f"layers.{i}.attention_norm.weight",
+            f"layers.{i}.feed_forward.w1.weight",
+            f"layers.{i}.feed_forward.w2.weight",
+            f"layers.{i}.feed_forward.w3.weight",
+            f"layers.{i}.ffn_norm.weight",
+        ]
+    return ret
+
+
+# The tensor names as an ordered list, and the same names as a set.
+TENSORS_LIST = make_tensors_list()
+TENSORS_SET = set(TENSORS_LIST)
+
+
+@dataclass
+class Params:
+    """Model hyper-parameters together with the output file type."""
+
+    n_vocab: int
+    n_embd: int
+    n_mult: int
+    n_head: int
+    n_layer: int
+    file_type: GGMLFileType
+
+    @staticmethod
+    def guessed(model: "LazyModel", file_type: GGMLFileType) -> "Params":
+        """Derive the parameters from the tensors present in ``model``.
+
+        ``n_vocab`` and ``n_embd`` are the shape of ``tok_embeddings.weight``; ``n_mult``
+        is fixed at 256 and ``n_head`` is taken as ``n_embd // 128``.
+        """
+        vocab_rows, embd_cols = model["tok_embeddings.weight"].shape
+
+        # The layer count is the first index whose attention.wq tensor is absent.
+        layer_count = 0
+        while f"layers.{layer_count}.attention.wq.weight" in model:
+            layer_count += 1
+
+        return Params(
+            n_vocab=vocab_rows,
+            n_embd=embd_cols,
+            n_mult=256,
+            n_head=embd_cols // 128,
+            n_layer=layer_count,
+            file_type=file_type,
+        )
+
+
+class SentencePieceVocab:
+    """Vocabulary read from a SentencePiece tokenizer file plus an optional added-tokens JSON file."""
+
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        """
+        :param fname_tokenizer: path of the SentencePiece model file
+        :param fname_added_tokens: path of a JSON object mapping token text to token id, or None
+        :raises Exception: when the added token ids are not exactly the consecutive ids
+            that follow the base vocabulary
+        """
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int]
+        if fname_added_tokens is not None:
+            # Close the file deterministically instead of leaving the handle to the garbage collector.
+            with open(fname_added_tokens, encoding="utf-8") as added_tokens_file:
+                added_tokens = json.load(added_tokens_file)
+        else:
+            added_tokens = {}
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            # The first expected id is the size of the base vocabulary.
+            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        """Yield (text bytes, score) for every token of the base tokenizer, in id order."""
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                # A byte piece must be six characters long; characters 3 and 4 are the hex value.
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                # U+2581 is replaced by a plain space.
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        """Yield (text bytes, score) for the added tokens in id order; the score is always -1000.0."""
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        """Yield the base tokens followed by the added tokens."""
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class GGMLVocab:
+    """Vocabulary given directly as a list of (token bytes, score) pairs."""
+
+    def __init__(self, tokens: List[Tuple[bytes, float]]):
+        self.vocab_size = len(tokens)
+        self.tokens = tokens
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        """Return the stored token list itself (not a copy)."""
+        return self.tokens
+
+    def __repr__(self) -> str:
+        return "<GGMLVocab with {} tokens>".format(self.vocab_size)
+
+
+# Either kind of vocabulary defined above.
+Vocab = Union[SentencePieceVocab, GGMLVocab]
+
+
+def permute(weights: NDArray, n_head: int) -> NDArray:
+    """Reorder the rows of ``weights``: within each of ``n_head`` row blocks, swap the two-way split with the rows inside it.
+
+    The result has the same shape as the input.
+    """
+    original_shape = weights.shape
+    rows_per_half = original_shape[0] // n_head // 2
+    # View the rows as (head, half, row-in-half, ...), swap the two middle axes, flatten back.
+    split = weights.reshape(n_head, 2, rows_per_half, *original_shape[1:])
+    return split.swapaxes(1, 2).reshape(original_shape)
+
+
+def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
+    """Unpack 4-bit values and compute ``scale * qvalue`` (plus ``addend`` when given).
+
+    :param qvalues_pack32: 2-D array of packed values; it is reinterpreted as uint8 and
+        every byte is split into its low and its high nibble
+    :param scales: 2-D array with as many rows as ``qvalues_pack32``
+    :param addends: None, or an array with the same shape as ``scales``; when None,
+        8 is subtracted from every unpacked value before scaling
+    :param g_idx: None, or an index array choosing the scale/addend column for each
+        value column; ``addends`` must be given when ``g_idx`` is
+    :returns: 2-D array of the dequantized values
+    """
+    # First reinterpret each row from a list of int32s containing 8 values each
+    # to a list of uint8s containing 2 values each.
+    qvalues_pack8 = qvalues_pack32.view(np.uint8)
+
+    # Then split out the two values per int8 (which requires an actual
+    # conversion because numpy doesn't natively support int4s).
+    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
+    qvalues[:, 0::2] = qvalues_pack8 & 0xF
+    qvalues[:, 1::2] = qvalues_pack8 >> 4
+
+    assert addends is None or addends.shape == scales.shape
+    assert qvalues.shape[0] == scales.shape[0]
+    assert qvalues.shape[1] % scales.shape[1] == 0
+    if g_idx is None:
+        # Each scale covers a contiguous run of repeat_count columns.
+        repeat_count = qvalues.shape[1] // scales.shape[1]
+        scales = scales[:, :, np.newaxis]
+        if addends is not None:
+            addends = addends[:, :, np.newaxis]
+        # Reshape so that the below computation broadcasts over scales and addends:
+        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
+    else:
+        # In this case the scale and addend is selected for each column by g_idx:
+        assert addends is not None
+        scales = scales[:, g_idx]
+        addends = addends[:, g_idx]
+    if addends is None:
+        # Q4_0
+        # Reinterpret as signed and shift in place, so the values 0..15 become -8..7.
+        qvalues = qvalues.view(np.int8)
+        qvalues -= 8
+    # And do the actual 'value = scale * qvalue + addend' computation.
+    values = scales * qvalues
+    if addends is not None:
+        values += addends
+    if g_idx is None:
+        # Undo the 3-D broadcast shape: back to (rows, columns).
+        values.shape = (values.shape[0], values.shape[1] * values.shape[2])
+    return values
+
+
+class Tensor(metaclass=ABCMeta):
+    """Abstract base of the tensor wrappers defined in this file."""
+
+    # Data type of the tensor's contents.
+    data_type: DataType
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> "Tensor":
+        """Return a tensor converted to ``data_type``."""
+        ...
+
+    @abstractmethod
+    def permute(self, n_head: int) -> "Tensor":
+        """Return a tensor permuted for ``n_head`` heads."""
+        ...
+
+    @abstractmethod
+    def to_ggml(self) -> "GGMLCompatibleTensor":
+        """Return the tensor as one of the GGML-compatible tensor classes."""
+        ...
+
+
+def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+    """Convert bfloat16 bit patterns (stored as uint16) to float32.
+
+    Each 16-bit pattern becomes the upper half of a 32-bit word, which is then
+    reinterpreted as float32.
+    """
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    widened = bf16_arr.astype(np.uint32)
+    shifted = widened << 16
+    return shifted.view(np.float32)
+
+
+class UnquantizedTensor(Tensor):
+    """Tensor backed by a plain numpy array whose dtype is one of the known unquantized types."""
+
+    def __init__(self, ndarray: NDArray) -> None:
+        """
+        :param ndarray: the data; its dtype must be a key of NUMPY_TYPE_TO_DATA_TYPE
+        """
+        assert isinstance(ndarray, np.ndarray)
+        self.ndarray = ndarray
+        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
+
+    def astype(self, data_type: DataType) -> Tensor:
+        """Return a new tensor holding the data converted to ``data_type``.
+
+        BF16 data is first expanded to float32. This tensor itself is left untouched, so
+        ``self.ndarray`` and ``self.data_type`` stay consistent and the method can be
+        called again.
+        """
+        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        source = self.ndarray
+        if self.data_type == DT_BF16:
+            # Work on a local: writing the float32 array back into self.ndarray would leave
+            # self.data_type == DT_BF16 describing non-uint16 data.
+            source = bf16_to_fp32(source)
+        return UnquantizedTensor(source.astype(dtype))
+
+    def to_ggml(self) -> "UnquantizedTensor":
+        """Return the tensor itself; no conversion is performed."""
+        return self
+
+    def permute(self, n_head: int) -> "UnquantizedTensor":
+        """Return a new tensor holding the permuted array."""
+        return UnquantizedTensor(permute(self.ndarray, n_head))
+
+
+def load_unquantized(lazy_tensor: "LazyTensor", expected_dtype: Any = None, convert: bool = False) -> NDArray:
+    """Load a lazy tensor that must be unquantized and return its numpy array.
+
+    :param expected_dtype: when given, the dtype the loaded array should have
+    :param convert: on a dtype mismatch, convert the array instead of raising
+    :raises ValueError: on a dtype mismatch when ``convert`` is false
+    """
+    loaded = lazy_tensor.load()
+    assert isinstance(loaded, UnquantizedTensor)
+
+    # double-check:
+    found_shape = list(loaded.ndarray.shape)
+    assert found_shape == lazy_tensor.shape, (found_shape, lazy_tensor.shape)
+
+    if expected_dtype is not None and expected_dtype != loaded.ndarray.dtype:
+        if not convert:
+            raise ValueError(f"expected this tensor to have dtype {expected_dtype}, got {loaded.ndarray.dtype}")
+        # The converted array is stored back on the loaded tensor before being returned.
+        loaded.ndarray = loaded.ndarray.astype(expected_dtype)
+
+    return loaded.ndarray
+
+
+class GGMLQuantizedTensor(Tensor):
+    """Quantized tensor (Q4_0 or Q4_1) stored as 32-bit words, one block per group of columns."""
+
+    data_type: QuantizedDataType
+
+    def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
+        """
+        :param ndarray: the raw block data; viewed as uint32 and reshaped to
+            (rows, columns // groupsize, words per block)
+        :param shape: [rows, columns] of the tensor
+        :param data_type: DT_Q4_0 or DT_Q4_1
+        """
+        rows, columns = shape
+        assert data_type in (DT_Q4_1, DT_Q4_0)  # for now
+        assert isinstance(data_type, QuantizedDataType)  # redundant, but mypy complains without this
+        assert columns % data_type.groupsize == 0
+        # A Q4_1 block is six 32-bit words, any other block five.
+        words_in_block = 6 if data_type == DT_Q4_1 else 5
+        self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
+        self.shape = shape[:]
+        self.data_type = data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        """Return ``self`` when the type already matches, otherwise dequantize and convert to ``data_type``."""
+        if data_type == self.data_type:
+            return self
+        # Word 0 of every block is the scale, reinterpreted as float32.
+        scales = self.ndarray[:, :, 0].view(np.float32)
+        if self.data_type.have_addends:
+            # Word 1 is the addend when the type has addends.
+            addends = self.ndarray[:, :, 1].view(np.float32)
+        else:
+            addends = None
+        # The last four words of every block are the packed 4-bit weights.
+        qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
+
+        dq = dequantize_q4(qweights, scales, addends, g_idx=None)
+        return UnquantizedTensor(dq).astype(data_type)
+
+    def to_ggml(self) -> "GGMLQuantizedTensor":
+        """Return the tensor itself; no conversion is performed."""
+        return self
+
+    def permute(self, n_head: int) -> "GGMLQuantizedTensor":
+        """Return a new tensor whose block array has been permuted along its first axis."""
+        return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
+
+
+# The tensor classes whose to_ggml() returns the tensor itself.
+GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
+
+
+class DeferredPermutedTensor(Tensor):
+    """Wraps a tensor and applies ``permute(n_head)`` only after it has been converted."""
+
+    def __init__(self, base: Tensor, n_head: int) -> None:
+        self.n_head = n_head
+        self.base = base
+        self.data_type = base.data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        """Convert the wrapped tensor first, then permute the result."""
+        converted = self.base.astype(data_type)
+        return converted.permute(self.n_head)
+
+    def to_ggml(self) -> GGMLCompatibleTensor:
+        """Convert the wrapped tensor to its GGML form first, then permute the result."""
+        ggml_tensor = self.base.to_ggml()
+        return ggml_tensor.permute(self.n_head)
+
+    def permute(self, n_head: int) -> Tensor:
+        """Always raises: a deferred permutation must not be permuted again."""
+        raise Exception("shouldn't permute twice")
+
+
+class GPTQForLLaMaQuantizedTensor(Tensor):
+    """4-bit quantized tensor assembled from the ``qweight``/``scales``/``zeros`` (or ``qzeros``) tensors of a model."""
+
+    def __init__(self, model: "LazyModel", namebase: str) -> None:
+        """
+        :param model: the model the component tensors are loaded from
+        :param namebase: common name prefix; the tensors read are ``{namebase}.qweight``,
+            ``.scales``, ``.zeros`` or ``.qzeros``, and optionally ``.bias`` and ``.g_idx``
+        """
+        qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
+        scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
+
+        bias = model.get(f"{namebase}.bias")
+        if bias is not None:
+            # Q4_1 does not support bias; good thing the bias is always all zeros.
+            assert not np.any(load_unquantized(bias))
+
+        if f"{namebase}.zeros" in model:
+            zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
+        else:
+            # Only packed zeros are available: unpack them with dequantize_q4, passing the
+            # scales as both the scale and the addend argument.
+            qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
+            assert qzeros.dtype == np.int32
+            zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
+            assert zeros.dtype == np.float32
+
+        assert zeros.shape == scales.shape
+
+        # Output is transposed compared to the input, and addends have their sign flipped.
+        # Scales and zeros similarly must be transposed but only for newer
+        # versions of GPTQ-for-LLaMa; the older versions can be identified by
+        # having shape (n_embd, 1).
+        qweight = qweight.T
+        if scales.shape[1] != 1:
+            scales = scales.T
+            zeros = zeros.T
+
+        # Output also has signs flipped for the addends.
+        self.qweight = qweight
+        self.scales = scales
+        self.addends = -zeros
+
+        self.g_idx: Optional[NDArray]
+        if f"{namebase}.g_idx" in model:
+            self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
+            assert self.g_idx.shape == (qweight.shape[1] * 8,)
+        else:
+            self.g_idx = None
+
+        # Every 32-bit word of qweight holds eight 4-bit values.
+        self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
+        self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True, have_g_idx=(self.g_idx is not None))
+
+    def inspect(self, row: int, col: int) -> None:
+        """For debugging."""
+        # Extract the 4-bit value of this column from its 32-bit word.
+        qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xF
+        if self.g_idx is not None:
+            group = self.g_idx[col]
+        else:
+            group = int(col // self.groupsize())
+        scale = self.scales[row, group]
+        addend = self.addends[row, group]
+        with np.printoptions(precision=None, suppress=True):
+            print(f"scale:{scale} addend:{addend} qweight:{qweight}")
+            print("possible values:", np.arange(16) * scale + addend)
+            print("actual value:", qweight * scale + addend)
+
+    def astype(self, data_type: DataType) -> Tensor:
+        """Regroup for a quantized target type, or dequantize and convert for an unquantized one."""
+        if isinstance(data_type, QuantizedDataType):
+            assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
+            return self.regroup(data_type.groupsize)
+
+        dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
+        return UnquantizedTensor(dequantized).astype(data_type)
+
+    def groupsize(self) -> int:
+        """Return the number of columns that share one scale/addend pair."""
+        assert self.addends.shape == self.scales.shape
+        assert self.shape[1] % self.scales.shape[1] == 0
+        return self.shape[1] // self.scales.shape[1]
+
+    def regroup(self, new_groupsize: int = 32) -> "GPTQForLLaMaQuantizedTensor":
+        """Return a shallow copy whose scales and addends are repeated to match ``new_groupsize``."""
+        # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
+        # columns in a row.  Newer versions share them between every set of N
+        # columns in a row, where N is the `groupsize` parameter, usually 128.  The
+        # output format shares them between every set of 32 columns.  To handle
+        # this, duplicate scales and addends for every smaller group.
+        # (In the above, 'row' and 'column' are in the sense of the output.)
+        assert self.g_idx is None
+        old_groupsize = self.groupsize()
+        assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
+        ret = copy.copy(self)
+        ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
+        ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
+        ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
+        return ret
+
+    def permute(self, n_head: int) -> Tensor:
+        """Return a wrapper that applies the permutation after conversion."""
+        return DeferredPermutedTensor(self, n_head)
+
+    def to_ggml(self) -> GGMLQuantizedTensor:
+        """Pack scales, addends and weights into Q4_1 blocks.
+
+        :raises Exception: when the group size is not 32
+        """
+        # The output format looks like this:
+        # For each row:
+        #   For each group of 32 columns:
+        #     - scale (float32, 4 bytes)
+        #     - addend (float32, 4 bytes)
+        #     - weights (int4 * 32, 16 bytes)
+
+        if self.groupsize() != 32:
+            raise Exception("should have been regrouped before converting to ggml")
+
+        # Since the output format is mixed between integers and floats, we have
+        # to hackily view the floats as int32s just so numpy will let us
+        # concatenate them.
+        addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
+        scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
+
+        # Split into groups of 4 columns (i.e. 32 columns of quantized data):
+        grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
+
+        # And concatenate:
+        grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting="no")
+
+        return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
+
+
+@dataclass
+class LazyTensor:
+    """A tensor whose data is only produced when ``load`` is called."""
+
+    _load: Callable[[], Tensor]
+    shape: List[int]
+    data_type: DataType
+    description: str
+
+    def load(self) -> Tensor:
+        """Run the loader and check that the result has the advertised data type."""
+        loaded = self._load()
+        assert loaded.data_type == self.data_type, (self.data_type, loaded.data_type, self.description)
+        return loaded
+
+    def astype(self, data_type: DataType) -> "LazyTensor":
+        """Return a lazy tensor that loads this one and converts it to ``data_type``.
+
+        The conversion is validated immediately; the data is converted only on load.
+        """
+        self.validate_conversion_to(data_type)
+        converted_description = f"convert({data_type}) {self.description}"
+        return LazyTensor(lambda: self.load().astype(data_type), self.shape, data_type, converted_description)
+
+    def validate_conversion_to(self, data_type: DataType) -> None:
+        """Check that converting to ``data_type`` is supported.
+
+        Raises for an unquantized-to-quantized conversion and exits the process with
+        status 1 when the source type uses g_idx and the target is quantized.
+        """
+        # Same type, or an unquantized target: nothing to check.
+        if data_type == self.data_type or not isinstance(data_type, QuantizedDataType):
+            return
+        if not isinstance(self.data_type, QuantizedDataType):
+            raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
+        if self.data_type.have_g_idx:
+            sys.stderr.write(
+                "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML.  For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n"
+            )
+            sys.exit(1)
+        assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
+
+
+# A model: tensor name -> lazily loaded tensor.
+LazyModel = Dict[str, LazyTensor]
+
+
+@dataclass
+class ModelPlus:
+    """A lazily loaded model together with where it came from, its format and its vocabulary."""
+
+    model: LazyModel
+    paths: List[Path]  # Where this was read from.
+    format: Literal["ggml", "torch", "safetensors"]
+    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
+
+
+def merge_sharded(models: List[LazyModel]) -> LazyModel:
+    """Combine models that each hold one part of every tensor into a single lazy model.
+
+    One-dimensional tensors, and tensors of a single-model input, are taken from the
+    first model unchanged; every other tensor becomes a lazy concatenation of its parts.
+    """
+    # Original LLaMA models have each file contain one part of each tensor.
+    # Dict keys (rather than a set) keep the names in first-seen order.
+    names = dict.fromkeys(name for model in models for name in model)
+    column_split_suffixes = (".attention.wo.weight", ".feed_forward.w2.weight")
+
+    def convert(name: str) -> LazyTensor:
+        shards: List[LazyTensor] = [model[name] for model in models]
+        first = shards[0]
+        # A single file is returned as is (it might hold quantized tensors), and a
+        # 1-D tensor is just duplicated in every file.
+        if len(shards) == 1 or len(first.shape) == 1:
+            return first
+        # These tensors are split by columns, all others by rows.
+        axis = 1 if name.startswith("tok_embeddings.") or name.endswith(column_split_suffixes) else 0
+        merged_shape = list(first.shape)
+        merged_shape[axis] = sum(shard.shape[axis] for shard in shards)
+
+        def load() -> UnquantizedTensor:
+            parts = [load_unquantized(shard) for shard in shards]
+            joined: NDArray = np.concatenate(parts, axis=axis)
+            return UnquantizedTensor(joined)
+
+        description = "concatenated[[" + "] | [".join(shard.description for shard in shards) + "]]"
+        return LazyTensor(load, merged_shape, first.data_type, description)
+
+    return {name: convert(name) for name in names}
+
+
+def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
+    """Combine the files of a multi-file model into one `ModelPlus`.
+
+    All files must share the same format.  The result carries every input
+    path and the first vocab that is not None (or None if there is none).
+    """
+    formats = {mp.format for mp in models_plus}
+    assert len(formats) == 1, "different formats?"
+    format = formats.pop()
+    paths = [path for mp in models_plus for path in mp.paths]
+    # Use the first non-None vocab, if any.
+    vocab = next((mp.vocab for mp in models_plus if mp.vocab is not None), None)
+
+    is_transformers_layout = any("model.embed_tokens.weight" in mp.model for mp in models_plus)
+    if is_transformers_layout:
+        # Transformers models put different tensors in different files, but
+        # don't split individual tensors between files.
+        model: LazyModel = {}
+        for mp in models_plus:
+            model.update(mp.model)
+    else:
+        model = merge_sharded([mp.model for mp in models_plus])
+
+    return ModelPlus(model, paths, format, vocab)
+
+
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+    """Return a lazy tensor that applies `permute(n_head)` to `lazy_tensor` on load.
+
+    Shape and data type are carried over unchanged from the input.
+    """
+    description = f"permute({n_head}) " + lazy_tensor.description
+    return LazyTensor(
+        lambda: lazy_tensor.load().permute(n_head),
+        lazy_tensor.shape,
+        lazy_tensor.data_type,
+        description,
+    )
+
+
+def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+    """Rename tensors from the Transformers naming scheme to the original one.
+
+    The q/k projection weights are additionally wrapped in `permute_lazy`;
+    every other tensor is passed through as-is under its new name.  Layers
+    are processed in order until a layer index has no q_proj weight.
+    """
+    out: LazyModel = {
+        "tok_embeddings.weight": model["model.embed_tokens.weight"],
+        "norm.weight": model["model.norm.weight"],
+        "output.weight": model["lm_head.weight"],
+    }
+
+    # The head count is derived from the q_proj width using a fixed divisor of 128.
+    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
+    layer = 0
+    while f"model.layers.{layer}.self_attn.q_proj.weight" in model:
+        src = f"model.layers.{layer}"
+        dst = f"layers.{layer}"
+
+        out[f"{dst}.attention.wq.weight"] = permute_lazy(model[f"{src}.self_attn.q_proj.weight"], n_head)
+        out[f"{dst}.attention.wk.weight"] = permute_lazy(model[f"{src}.self_attn.k_proj.weight"], n_head)
+        out[f"{dst}.attention.wv.weight"] = model[f"{src}.self_attn.v_proj.weight"]
+        out[f"{dst}.attention.wo.weight"] = model[f"{src}.self_attn.o_proj.weight"]
+
+        out[f"{dst}.feed_forward.w1.weight"] = model[f"{src}.mlp.gate_proj.weight"]
+        out[f"{dst}.feed_forward.w2.weight"] = model[f"{src}.mlp.down_proj.weight"]
+        out[f"{dst}.feed_forward.w3.weight"] = model[f"{src}.mlp.up_proj.weight"]
+
+        out[f"{dst}.attention_norm.weight"] = model[f"{src}.input_layernorm.weight"]
+        out[f"{dst}.ffn_norm.weight"] = model[f"{src}.post_attention_layernorm.weight"]
+        layer += 1
+    return out
+
+
+def handle_quantization(model: LazyModel) -> LazyModel:
+    """Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
+    (which resolve to UnquantizedTensors with the raw data) to one with entries
+    for 'foo.weight' (which resolve to QuantizedTensors).
+
+    Entries whose name does not end in '.qweight' are passed through
+    unchanged under their original name.
+    """
+
+    def convert(name: str) -> Tuple[str, LazyTensor]:
+        if not name.endswith(".qweight"):
+            return (name, model[name])
+
+        namebase = name.rsplit(".", 1)[0]
+        packed = model[name]
+        assert len(packed.shape) == 2
+        real_shape = [packed.shape[1], packed.shape[0] * 8]
+
+        # Calculate type.  This replicates the logic in
+        # GPTQForLLaMaQuantizedTensor (which is executed when the model is
+        # actually loaded).
+        scales = model[f"{namebase}.scales"]
+        scales_width = 1 if scales.shape[1] == 1 else scales.shape[0]
+        assert real_shape[1] % scales_width == 0
+        data_type = QuantizedDataType(
+            groupsize=real_shape[1] // scales_width,
+            have_addends=True,
+            have_g_idx=f"{namebase}.g_idx" in model,
+        )
+
+        def load() -> Tensor:
+            return GPTQForLLaMaQuantizedTensor(model, namebase)
+
+        return (namebase + ".weight", LazyTensor(load, real_shape, data_type, "[quantized]"))
+
+    return dict(convert(name) for name in model)
+
+
+# Functionality that simulates `torch.load` but where individual tensors are
+# only loaded into memory on demand, not all at once.
+# PyTorch can't do this natively as of time of writing:
+# - https://github.com/pytorch/pytorch/issues/64327
+# This allows us to de-shard without multiplying RAM usage, and also
+# conveniently drops the PyTorch dependency (though we still need numpy).
+
+
+@dataclass
+class LazyStorageKind:
+    """Stand-in for a torch storage class; records only the element data type."""
+
+    data_type: DataType
+
+
+@dataclass
+class LazyStorage:
+    """Handle to a storage whose contents are read on demand."""
+
+    # Called as load(offset, elm_count); both arguments are counted in elements.
+    load: Callable[[int, int], NDArray]
+    kind: LazyStorageKind
+    description: str
+
+
+class LazyUnpickler(pickle.Unpickler):
+    """Unpickler that maps torch storages/tensors to lazy placeholders.
+
+    `fp` is the pickle stream, `zip_file` the checkpoint archive, and
+    `data_base_path` the archive directory under which storage files are
+    looked up.  Tensor data is only read from the archive when the resulting
+    `LazyTensor` is loaded.
+    """
+
+    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
+        super().__init__(fp)
+        self.data_base_path = data_base_path
+        self.zip_file = zip_file
+
+    def persistent_load(self, pid: Any) -> Any:
+        """Resolve a persistent id of the form ("storage", kind, filename_stem, ...)
+        to a `LazyStorage` backed by the matching archive member."""
+        assert pid[0] == "storage"
+        assert isinstance(pid[1], LazyStorageKind)
+        data_type = pid[1].data_type
+        filename_stem = pid[2]
+        filename = self.data_base_path + "/" + filename_stem
+        info = self.zip_file.getinfo(filename)
+
+        def load(offset: int, elm_count: int) -> NDArray:
+            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
+            if dtype is None:
+                raise Exception("tensor stored in unsupported format")
+            size = elm_count * dtype.itemsize
+            # Close the archive member as soon as the bytes are read; `data`
+            # is an independent bytes object, so nothing keeps using the
+            # handle afterwards.
+            with self.zip_file.open(info) as fp:
+                fp.seek(offset * dtype.itemsize)
+                data = fp.read(size)
+            assert len(data) == size
+            return np.frombuffer(data, dtype)
+
+        description = f"storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}"
+        return LazyStorage(load=load, kind=pid[1], description=description)
+
+    # The two functions below are deliberately plain functions (no `self`):
+    # they are only reached through the CLASSES table, never as bound methods.
+
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(
+        storage: Any,
+        storage_offset: Any,
+        size: Any,
+        stride: Any,  # pyright: ignore[reportSelfClsParameterName]
+        requires_grad: Any,
+        backward_hooks: Any,
+        metadata: Any = None,
+    ) -> LazyTensor:
+        """Replacement for torch's tensor rebuild hook that returns a `LazyTensor`."""
+        assert isinstance(storage, LazyStorage)
+
+        def load() -> UnquantizedTensor:
+            # Element count is taken from the first dimension's stride and size.
+            elm_count = stride[0] * size[0]
+            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
+
+        description = f"pickled storage_offset={storage_offset} in {storage.description}"
+        return LazyTensor(load, list(size), storage.kind.data_type, description)
+
+    # @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        """Call `func(*args)`; `new_type` and `state` are ignored."""
+        return func(*args)
+
+    # Replacements returned by find_class for the torch names used in checkpoints.
+    CLASSES: Dict[Any, Any] = {
+        ("torch._tensor", "_rebuild_from_type_v2"): rebuild_from_type_v2,
+        ("torch._utils", "_rebuild_tensor_v2"): lazy_rebuild_tensor_v2,
+        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
+        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
+        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
+        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
+        ("torch", "Tensor"): LazyTensor,
+    }
+
+    def find_class(self, module: str, name: str) -> Any:
+        """Return the lazy replacement for torch names; defer everything else.
+
+        Raises KeyError for a torch name that is not listed in CLASSES.
+        """
+        if not module.startswith("torch"):
+            # SECURITY NOTE: non-torch globals are resolved by the stock
+            # pickle machinery, so unpickling can import and call arbitrary
+            # objects.  Only load checkpoints from sources you trust.
+            return super().find_class(module, name)
+        return self.CLASSES[(module, name)]
+
+
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
+    """Lazily load a PyTorch zip checkpoint.
+
+    The archive must contain exactly one `.pkl` member.  That pickle is read
+    with `LazyUnpickler`, so tensor data stays in the archive until each
+    tensor is loaded.  The archive itself is left open on purpose: the lazy
+    tensors read from it later.
+    """
+    zf = zipfile.ZipFile(outer_fp)
+    pickle_paths = [name for name in zf.namelist() if name.endswith(".pkl")]
+    assert len(pickle_paths) == 1, pickle_paths
+    pickle_path = pickle_paths[0]
+    # Only the pickle stream is closed here; it is fully consumed by load().
+    with zf.open(pickle_path, "r") as pickle_fp:
+        # Storage files live under the pickle's path minus its ".pkl" suffix.
+        unpickler = LazyUnpickler(pickle_fp, data_base_path=pickle_path[:-4], zip_file=zf)
+        model = unpickler.load()
+    as_dict = dict(model.items())
+    return ModelPlus(model=as_dict, paths=[path], format="torch", vocab=None)
+
+
+# Maps the "dtype" string of a safetensors header entry to the internal data
+# type.  Any dtype not listed here fails the lookup with a KeyError.
+SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    "F16": DT_F16,
+    "F32": DT_F32,
+    "I32": DT_I32,
+}
+
+
+def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    """Lazily load a safetensors file.
+
+    The file starts with an 8-byte little-endian header size followed by a
+    JSON header of that size; tensor data follows.  Each header entry other
+    than "__metadata__" becomes a `LazyTensor` that views its byte range of
+    the memory-mapped file.
+    """
+    header_size = struct.unpack("<Q", fp.read(8))[0]
+    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    byte_buf = mapped[8 + header_size :]
+
+    def convert(info: Dict[str, Any]) -> LazyTensor:
+        data_type = SAFETENSORS_DATA_TYPES[info["dtype"]]
+        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+        shape: List[int] = info["shape"]
+        begin, end = info["data_offsets"]
+        # The byte range must lie inside the data area and match the shape.
+        assert 0 <= begin <= end <= len(byte_buf)
+        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
+        buf = byte_buf[begin:end]
+
+        def load() -> UnquantizedTensor:
+            flat = np.frombuffer(buf, dtype=numpy_dtype)
+            return UnquantizedTensor(flat.reshape(shape))
+
+        description = f"safetensors begin={begin} end={end} type={data_type} path={path}"
+        return LazyTensor(load, shape, data_type, description)
+
+    model = {}
+    for name, info in header.items():
+        if name == "__metadata__":
+            continue
+        model[name] = convert(info)
+    return ModelPlus(model=model, paths=[path], format="safetensors", vocab=None)
+
+
+def must_read(fp: IO[bytes], length: int) -> bytes:
+    """Read `length` bytes from `fp`, raising if the stream ends early."""
+    data = fp.read(length)
+    if len(data) >= length:
+        return data
+    raise Exception("unexpectedly reached end of file")
+
+
+def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
+    """Lazily load a GGML-family file ("ggml", "ggmf" or "ggjt" magic).
+
+    The header and vocab section are parsed eagerly from `fp`.  For each
+    tensor only its header is read; the data is captured as a slice of a
+    memory map and turned into an array when the `LazyTensor` is loaded.
+    The returned vocab is None for the plain "ggml" magic.
+    """
+    # The magic is compared after reversing the four bytes read from the file.
+    magic = must_read(fp, 4)[::-1]
+    if magic in (b"ggmf", b"ggjt"):
+        (version,) = struct.unpack("i", must_read(fp, 4))
+        assert version == 1
+    else:
+        assert magic == b"ggml"
+        version = None
+    # Seven little-endian int32 hyperparameters; only n_vocab is used below.
+    n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack("<7i", must_read(fp, 28))
+
+    tokens: List[Tuple[bytes, float]] = []
+    for i in range(n_vocab):
+        if i == 32000:
+            # HACK: GPT4All messed with the format without changing the magic
+            # number.  Specifically, they changed the vocab section to contain
+            # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
+            # extra pad token).  Try to detect if we're reading a file like
+            # this.
+            orig_pos = fp.tell()
+            fp.seek(20, io.SEEK_CUR)
+            is_gpt4all = fp.read(21) == b"tok_embeddings.weight"
+            fp.seek(orig_pos)
+            if is_gpt4all:
+                break
+
+        (length,) = struct.unpack("i", must_read(fp, 4))
+        text = must_read(fp, length)
+        # Only the non-"ggml" magics carry a score; tokens are collected only for them.
+        if magic != b"ggml":
+            (score,) = struct.unpack("f", must_read(fp, 4))
+            tokens.append((text, score))
+    vocab = GGMLVocab(tokens) if magic != b"ggml" else None
+
+    model: LazyModel = {}
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    off = fp.raw.tell()
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    fp.raw.seek(off)  # needed on Windows
+
+    def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
+        shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
+        assert 0 <= shape_len <= 3
+        shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
+        # Dimensions are reversed relative to the order stored in the file.
+        shape = shape[::-1]
+        name = must_read(fp, name_len).decode("utf-8")
+        data_type = FTYPE_TO_DATA_TYPE[ftype]
+
+        if magic == b"ggjt":
+            # Round the position up to the next multiple of 32.
+            fp.seek((fp.tell() + 31) & -32)
+
+        if data_type == DT_Q4_1:
+            # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
+            size = 24 * (shape[1] // 32) * shape[0]
+        elif data_type == DT_Q4_0:
+            size = 20 * (shape[1] // 32) * shape[0]
+        else:
+            numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+            elm_count = math.prod(shape)
+            size = elm_count * numpy_dtype.itemsize
+        offset = fp.tell()
+        buf = mapped[offset : offset + size]
+        # Skip over the data; it is read through `buf` when the tensor is loaded.
+        fp.seek(size, io.SEEK_CUR)
+
+        def load() -> Tensor:
+            if isinstance(data_type, QuantizedDataType):
+                ndarray = np.frombuffer(buf, dtype=np.uint32)
+                return GGMLQuantizedTensor(ndarray, shape, data_type)
+            else:
+                return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+
+        description = f"ggml offset={offset} type={data_type} path={path}"
+        model[name] = LazyTensor(load, shape, data_type, description)
+
+    # Peek one byte to detect end of file, then step back before parsing a tensor.
+    while fp.read(1) != b"":
+        fp.seek(-1, io.SEEK_CUR)
+        read_tensor()
+
+    return ModelPlus(model=model, paths=[path], format="ggml", vocab=vocab)
+
+
+@functools.lru_cache(maxsize=None)
+def lazy_load_file(path: Path) -> ModelPlus:
+    """Open `path`, detect its format from the first 8 bytes and load it lazily.
+
+    Results are cached per path.  A file whose magic is already "ggjt" ends
+    the process (via SystemExit) since there is nothing to convert.
+
+    Raises:
+        ValueError: if the file is not a zip (PyTorch), GGML or safetensors file.
+    """
+    fp = open(path, "rb")
+    first8 = fp.read(8)
+    # Compare raw bytes: decoding the header as UTF-8 would raise on binary
+    # content (e.g. a safetensors length byte >= 0x80) before any format
+    # check could run.
+    magic = first8[:4][::-1]
+    print("Model type:" + magic.decode("utf-8", errors="replace"))
+    if magic == b"ggjt":
+        print("Model already the latest GGJT and no need to convert.")
+        fp.close()
+        sys.exit()
+
+    print("Converting GGML to GGJT")
+    fp.seek(0)
+    if first8[:2] == b"PK":
+        # A zip file, i.e. PyTorch format
+        return lazy_load_torch_file(fp, path)
+    if first8[2:4] == b"gg":
+        # GGML format
+        return lazy_load_ggml_file(fp, path)
+    # A short file cannot be unpacked as a 64-bit header size.
+    if len(first8) == 8 and struct.unpack("<Q", first8)[0] < 16 * 1024 * 1024:
+        # Probably safetensors
+        return lazy_load_safetensors_file(fp, path)
+    fp.close()
+    raise ValueError(f"unknown format: {path}")
+
+
+# Input and output element types of `bounded_parallel_map`.
+In = TypeVar("In")
+Out = TypeVar("Out")
+
+
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
+    """Parallel map, but with backpressure.  If the caller doesn't call `next`
+    fast enough, this will stop calling `func` at some point rather than
+    letting results pile up in memory.  Specifically, there is a max of one
+    output value buffered per thread.
+
+    Results are yielded in input order.  At most `concurrency` calls are in
+    flight at any time; a new one is submitted each time a result is taken.
+    """
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Reversed so that pop() hands out the items in their original order.
+        pending = list(iterable)
+        pending.reverse()
+        in_flight: List[concurrent.futures.Future[Out]] = [
+            executor.submit(func, pending.pop()) for _ in range(min(concurrency, len(pending)))
+        ]
+        while in_flight:
+            oldest = in_flight.pop(0)
+            result = oldest.result()
+            if pending:
+                in_flight.append(executor.submit(func, pending.pop()))
+            yield result
+
+
+def check_vocab_size(params: Params, vocab: Vocab) -> None:
+    """Ensure the model's vocab size agrees with the vocab's.
+
+    If the model matches the vocab without its added tokens, the added
+    tokens are dropped from `vocab` (which is modified in place).  Any other
+    mismatch raises an Exception describing both sizes.
+    """
+    if params.n_vocab == vocab.vocab_size:
+        return
+
+    # GGMLVocab comes from the same file as the model so shouldn't mismatch:
+    assert isinstance(vocab, SentencePieceVocab)
+    if params.n_vocab == vocab.vocab_size_base:
+        print("Ignoring added_tokens.json since model matches vocab size without it.")
+        vocab.added_tokens_list = []
+        vocab.vocab_size = vocab.vocab_size_base
+        return
+
+    has_added_tokens = vocab.fname_added_tokens is not None
+    parts = [f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"]
+    if has_added_tokens:
+        parts.append(f" combined with {vocab.fname_added_tokens}")
+    parts.append(f" has {vocab.vocab_size}).")
+    # A model slightly larger than the vocab hints at a missing added_tokens.json.
+    if not has_added_tokens and vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
+        parts.append(f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent}).")
+    raise Exception("".join(parts))
+
+
+class OutputFile:
+    """Writer for the GGJT output file; the file is opened on construction."""
+
+    def __init__(self, fname_out: Path) -> None:
+        self.fout = open(fname_out, "wb")
+
+    def write_file_header(self, params: Params) -> None:
+        """Write the magic followed by the file version and hyperparameters."""
+        self.fout.write(b"ggjt"[::-1])  # magic
+        values = [
+            1,  # file version
+            params.n_vocab,
+            params.n_embd,
+            params.n_mult,
+            params.n_head,
+            params.n_layer,
+            params.n_embd // params.n_head,  # rot (obsolete)
+            params.file_type.value,
+        ]
+        self.fout.write(struct.pack("i" * len(values), *values))
+
+    def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
+        """Write one tensor's header, then move to the next 32-byte boundary."""
+        sname = name.encode("utf-8")
+        self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
+        # Dimensions are stored in reverse order.
+        self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
+        self.fout.write(sname)
+        self.fout.seek((self.fout.tell() + 31) & -32)
+
+    def write_vocab(self, vocab: Vocab) -> None:
+        """Write every token as (int32 length, bytes, float32 score)."""
+        for text, score in vocab.all_tokens():
+            self.fout.write(struct.pack("i", len(text)))
+            self.fout.write(text)
+            self.fout.write(struct.pack("f", score))
+
+    @staticmethod
+    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
+        """Write a file that contains only the header and the vocab."""
+        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+        # Open the output exactly once (a second open would leak the first
+        # handle) and close it even if writing fails.
+        of = OutputFile(fname_out)
+        try:
+            of.write_file_header(params)
+            of.write_vocab(vocab)
+        finally:
+            of.fout.close()
+
+    @staticmethod
+    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+        """Write header, vocab and all tensors of `model` to `fname_out`.
+
+        Tensors are loaded and converted on worker threads while this thread
+        writes them out in model order.  The vocab size is checked before the
+        output file is created.
+        """
+        check_vocab_size(params, vocab)
+        of = OutputFile(fname_out)
+        try:
+            of.write_file_header(params)
+            print("Writing vocab...")
+            of.write_vocab(vocab)
+
+            def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
+                _, lazy_tensor = item
+                return lazy_tensor.load().to_ggml().ndarray
+
+            ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
+            padi = len(str(len(model)))  # width of the progress counter
+            for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+                size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
+                print(f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
+                of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
+                ndarray.tofile(of.fout)
+        finally:
+            of.fout.close()
+
+
+def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
+    """Choose the output file type.
+
+    An explicit `output_type_str` ("f32", "f16", "q4_1", "q4_0") wins;
+    when it is None the choice follows the data type of the first layer's
+    wq weight.  Raises if no rule applies.
+    """
+    wq_type = model["layers.0.attention.wq.weight"].data_type
+    auto = output_type_str is None
+    wq_quantized = isinstance(wq_type, QuantizedDataType)
+
+    if output_type_str == "f32" or (auto and wq_type in (DT_F32, DT_BF16)):
+        return GGMLFileType.AllF32
+    if output_type_str == "f16" or (auto and wq_type == DT_F16):
+        return GGMLFileType.MostlyF16
+    if output_type_str == "q4_1" or (auto and wq_quantized and wq_type.have_addends):
+        # Distinguish files whose output weight is quantized from those where it is not.
+        output_is_quantized = isinstance(model["output.weight"].data_type, QuantizedDataType)
+        return GGMLFileType.MostlyQ4_1 if output_is_quantized else GGMLFileType.PerLayerIsQ4_1
+    if output_type_str == "q4_0" or (auto and wq_quantized):
+        return GGMLFileType.MostlyQ4_0
+
+    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+    raise Exception(f"Unexpected combination of types: {name_to_type}")
+
+
+def do_necessary_conversions(model: LazyModel) -> LazyModel:
+    """Normalize a freshly loaded model.
+
+    Steps, in order: fold quantization entries into weights, rename
+    Transformers-style tensors (only when "lm_head.weight" is present), then
+    filter and sort the tensors.
+    """
+    quantization_handled = handle_quantization(model)
+    if "lm_head.weight" in quantization_handled:
+        renamed = convert_transformers_to_orig(quantization_handled)
+    else:
+        renamed = quantization_handled
+    return filter_and_sort_tensors(renamed)
+
+
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    """Return a model in which each tensor is (lazily) converted to the data
+    type that `output_type` selects for it."""
+    converted: LazyModel = {}
+    for name, tensor in model.items():
+        target_type = output_type.type_for_tensor(name, tensor)
+        converted[name] = tensor.astype(target_type)
+    return converted
+
+
+def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
+    """Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the nth path in the model.
+
+    The naming patterns are tried in order; the first rewritten name that
+    exists on disk is returned, otherwise None.
+    """
+    # Support the following patterns:
+    patterns: List[Tuple[str, str]] = [
+        # - x.00.pth, x.01.pth, etc.
+        (r"\.[0-9]{2}\.pth$", f".{n:02}.pth"),
+        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
+        (r"-[0-9]{5}-of-(.*)$", rf"-{n:05}-of-\1"),
+        # x.bin, x.bin.1, etc.
+        (r"(\.[0-9]+)?$", r"\1" if n == 0 else rf"\1.{n}"),
+    ]
+    file_name = path.name
+    for regex, replacement in patterns:
+        if not re.search(regex, file_name):
+            continue
+        candidate = path.with_name(re.sub(regex, replacement, file_name))
+        if candidate.exists():
+            return candidate
+    return None
+
+
+def find_multifile_paths(path: Path) -> List[Path]:
+    """Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the whole list of paths in the model.
+
+    Paths are collected for n = 0, 1, 2, ... until one is missing.
+    """
+    found: List[Path] = []
+    index = 0
+    while True:
+        candidate = nth_multifile_path(path, index)
+        if candidate is None:
+            break
+        found.append(candidate)
+        index += 1
+    # No matches.  This should only happen if the file was named, e.g.,
+    # foo.0, and there was no file named foo.  Oh well, try to process it
+    # as a single file.
+    return found if found else [path]
+
+
+def load_some_model(path: Path) -> ModelPlus:
+    """Load a model of any supported format.
+
+    `path` may be a model file or a directory.  For a directory exactly one
+    candidate file must be found, otherwise an Exception is raised.  All
+    files of a multi-file model are loaded and merged.
+    """
+    # Be extra-friendly and accept either a file or a directory:
+    if path.is_dir():
+        # Groups are tried in priority order: safetensors first, then the
+        # PyTorch patterns, then GGML.  GGML comes last since if both a
+        # non-GGML model and a GGML model exist in the same directory, we
+        # assume the latter was converted from the former.
+        glob_groups = [
+            ["model-00001-of-*.safetensors"],
+            ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"],
+            ["ggml-model*.bin*"],
+        ]
+        files: List[Path] = []
+        for group in glob_groups:
+            files = [file for pattern in group for file in path.glob(pattern)]
+            if files:
+                break
+        if not files:
+            raise Exception(f"Can't find model in directory {path}")
+        if len(files) > 1:
+            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+        path = files[0]
+
+    models_plus: List[ModelPlus] = []
+    for model_file in find_multifile_paths(path):
+        print(f"Loading model file {model_file}")
+        models_plus.append(lazy_load_file(model_file))
+
+    return merge_multifile_models(models_plus)
+
+
+def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
+    """Keep only the tensors named in TENSORS_LIST, in TENSORS_LIST order."""
+    selected: LazyModel = {}
+    for name in TENSORS_LIST:
+        if name in model:
+            selected[name] = model[name]
+    return selected
+
+
+def load_vocab(path: Path) -> SentencePieceVocab:
+    """Load the SentencePiece vocab from a tokenizer file or a directory.
+
+    For a directory, tokenizer.model is looked up in it and then in its
+    parent; FileNotFoundError is raised if neither exists.  An
+    added_tokens.json next to the tokenizer is passed along when present.
+    """
+    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    # a directory, it might be the model directory, and tokenizer.model might
+    # be in the parent of that.
+    if path.is_dir():
+        # Use `.parent` instead of /.. to handle the symlink case better.
+        candidates = (path / "tokenizer.model", path.parent / "tokenizer.model")
+        for candidate in candidates:
+            if candidate.exists():
+                path = candidate
+                break
+        else:
+            raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+    added_tokens_path = path.parent / "added_tokens.json"
+    print(f"Loading vocab file {path}")
+    added_tokens = added_tokens_path if added_tokens_path.exists() else None
+    return SentencePieceVocab(path, added_tokens)
+
+
+def default_outfile(model_paths: List[Path], params: Params) -> Path:
+    """Return the default output path, next to the first input file.
+
+    Exits the process (status 1) if that path is one of the inputs, since
+    writing to it would overwrite the model being read.
+    """
+    suffix_by_type = {
+        GGMLFileType.AllF32: "f32",
+        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ4_0: "q4_0",
+        GGMLFileType.MostlyQ4_1: "q4_1",
+        GGMLFileType.PerLayerIsQ4_1: "q4_1",
+    }
+    namestr = suffix_by_type[params.file_type]
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
+    if ret not in model_paths:
+        return ret
+    sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input.  Please explicitly specify a path using --outfile.\n")
+    sys.exit(1)
+
+
+def do_dump_model(model_plus: ModelPlus) -> None:
+    """Print the model's paths, format and vocab, then one line per tensor."""
+    for attr in ("paths", "format", "vocab"):
+        print(f"model_plus.{attr} = {getattr(model_plus, attr)!r}")
+    for name, lazy_tensor in model_plus.model.items():
+        details = f"shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
+        print(f"{name}: {details}")
+
+
+def main(args_in: Optional[List[str]] = None) -> None:
+    """Convert the model at the given path and write the result.
+
+    The model path is the first entry of `args_in` (or of the command line
+    when `args_in` is None); if no argument is given, the MODEL_PATH
+    environment variable (after loading .env) is used.  The output type is
+    chosen automatically from the model.
+    """
+    # parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    # parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
+    # parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
+    # parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    # parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
+    # parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
+    # parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+    # parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    # args = parser.parse_args(args_in)
+
+    load_dotenv()
+    print(os.environ.get("MODEL_PATH"))
+    # Honor explicitly passed arguments; fall back to the command line.
+    cli_args = sys.argv[1:] if args_in is None else args_in
+    model_path = cli_args[0] if cli_args else os.environ.get("MODEL_PATH")
+    if model_path is None:
+        # Fail with a clear message instead of a TypeError from Path(None).
+        sys.stderr.write("Error: no model path given.  Pass it as the first argument or set MODEL_PATH.\n")
+        sys.exit(1)
+
+    vocab: Vocab
+    model_plus = load_some_model(Path(model_path))
+    vocab = model_plus.vocab or load_vocab(model_plus.paths[0].parent)
+    model = model_plus.model
+    model = do_necessary_conversions(model)
+    output_type = pick_output_type(model, None)
+    model = convert_to_output_type(model, output_type)
+    params = Params.guessed(model, output_type)
+    # NOTE(review): the output name is built from the *parent directory* of
+    # model_path plus "_new.bin", so it lands beside that directory rather
+    # than inside it -- confirm this is intended.
+    outfile = Path(str(Path(model_path).parent) + "_new.bin")
+    OutputFile.write_all(outfile, params, model, vocab)
+    print(f"Wrote {outfile}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/casalioy/misc/dev_debug_formatting.py b/casalioy/misc/dev_debug_formatting.py
new file mode 100644
index 0000000..15e1ef8
--- /dev/null
+++ b/casalioy/misc/dev_debug_formatting.py
@@ -0,0 +1,18 @@
+"""dev utility to debug formatting problems arising in print_HTML"""
+from prompt_toolkit import HTML
+
+from casalioy.utils import print_HTML
+
+## Add to print_HTML
+# with open("temp.txt", "w", encoding="utf-8") as f:
+#     f.write(text.format(**kwargs))
+
+# Read back the text that print_HTML was asked to render.
+with open("temp.txt", "r", encoding="utf-8") as f:
+    s = f.read()
+
+
+def escape_one(v):
+    """Replace form feeds with spaces and backspaces with backslashes."""
+    without_form_feeds = v.replace("\f", " ")
+    return without_form_feeds.replace("\b", "\\")
+
+
+s = escape_one(s)
+
+# Show the text raw, parsed as HTML, and rendered through print_HTML.
+print(s)
+print(HTML(s))
+print_HTML(s)
diff --git a/casalioy/misc/tokenizer.model b/casalioy/misc/tokenizer.model
new file mode 100644
index 0000000..22bccbc
Binary files /dev/null and b/casalioy/misc/tokenizer.model differ
diff --git a/casalioy/startLLM.py b/casalioy/startLLM.py
new file mode 100644
index 0000000..378abe6
--- /dev/null
+++ b/casalioy/startLLM.py
@@ -0,0 +1,135 @@
+"""start the local LLM"""
+
+import qdrant_client
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain.chains import RetrievalQA
+from langchain.embeddings.base import Embeddings
+from langchain.vectorstores import Qdrant
+from prompt_toolkit import PromptSession
+from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
+from prompt_toolkit.formatted_text.html import html_escape
+
+from casalioy.CustomChains import RefineQA, StuffQA
+from casalioy.load_env import (
+    chain_type,
+    get_embedding_model,
+    get_prompt_template_kwargs,
+    model_max_tokens,
+    model_n_ctx,
+    model_path,
+    model_stop,
+    model_temp,
+    model_type,
+    n_forward_documents,
+    n_gpu_layers,
+    n_retrieve_documents,
+    persist_directory,
+    use_mlock,
+)
+from casalioy.utils import print_HTML, prompt_HTML
+
+
+class QASystem:
+    """custom QA system
+
+    Wires a local Qdrant vector store to a local LLM through a retrieval QA
+    chain.  Besides the constructor arguments, the module-level settings
+    `model_type`, `model_max_tokens`, `chain_type`, `n_forward_documents`
+    and `n_retrieve_documents` are read.
+    """
+
+    def __init__(
+        self,
+        embeddings: Embeddings,
+        db_path: str,
+        model_path: str,
+        n_ctx: int,
+        model_temp: float,
+        stop: list[str],
+        use_mlock: bool,
+        n_gpu_layers: int,
+        collection="test",
+    ):
+        """Open the vector store at `db_path` and build the LLM chain.
+
+        Raises:
+            ValueError: if `model_type` is neither "LlamaCpp" nor "GPT4All".
+        """
+        # Get embeddings and local vector store
+        self.qdrant_client = qdrant_client.QdrantClient(path=db_path, prefer_grpc=True)
+        self.qdrant_langchain = Qdrant(client=self.qdrant_client, collection_name=collection, embeddings=embeddings)
+
+        # Prepare the LLM chain
+        callbacks = [StreamingStdOutCallbackHandler()]
+        match model_type:
+            case "LlamaCpp":
+                from langchain.llms import LlamaCpp
+
+                llm = LlamaCpp(
+                    model_path=model_path,
+                    n_ctx=n_ctx,
+                    temperature=model_temp,
+                    stop=stop,
+                    callbacks=callbacks,
+                    verbose=True,
+                    n_threads=6,
+                    n_batch=1000,
+                    use_mlock=use_mlock,
+                    n_gpu_layers=n_gpu_layers,
+                    max_tokens=model_max_tokens,
+                )
+                # Fix wrong default
+                object.__setattr__(llm, "get_num_tokens", lambda text: len(llm.client.tokenize(b" " + text.encode("utf-8"))))
+
+            case "GPT4All":
+                from langchain.llms import GPT4All
+
+                llm = GPT4All(
+                    model=model_path,
+                    n_ctx=n_ctx,
+                    callbacks=callbacks,
+                    verbose=True,
+                    backend="gptj",
+                )
+            case _:
+                raise ValueError("Only LlamaCpp or GPT4All supported right now. Make sure you set up your .env correctly.")
+
+        self.llm = llm
+        retriever = self.qdrant_langchain.as_retriever(search_type="mmr")
+        if chain_type == "betterstuff":
+            self.qa = StuffQA(retriever=retriever, llm=self.llm)
+        elif chain_type == "betterrefine":
+            self.qa = RefineQA(retriever=retriever, llm=self.llm)
+        else:
+            self.qa = RetrievalQA.from_chain_type(
+                llm=self.llm,
+                chain_type=chain_type,
+                retriever=retriever,
+                return_source_documents=True,
+                chain_type_kwargs=get_prompt_template_kwargs(),
+            )
+        # Keep any existing search kwargs and set how many documents are
+        # forwarded ("k") and retrieved ("fetch_k").
+        self.qa.retriever.search_kwargs = {**self.qa.retriever.search_kwargs, "k": n_forward_documents, "fetch_k": n_retrieve_documents}
+
+    def prompt_once(self, query: str) -> tuple[str, str]:
+        """run a prompt
+
+        Runs `query` through the chain, prints question, answer and sources,
+        and returns `(answer, sources_str)` where `sources_str` is the
+        HTML-escaped source listing.
+        """
+        # Get the answer from the chain
+        res = self.qa(query)
+        answer, docs = res["result"], res["source_documents"]
+
+        # Print the result
+        sources_str = "\n\n".join(f">> <source>{html_escape(document.metadata['source'])}</source>:\n{html_escape(document.page_content)}" for document in docs)
+        # The query and answer are arbitrary text, so they are left as
+        # {placeholders} and passed as kwargs: print_HTML substitutes them via
+        # HTML.format, which escapes them.  Interpolating them directly into
+        # the markup would break on "<", "&", "{" or "}".
+        # sources_str is already escaped markup; only its braces need doubling
+        # so that the format step leaves them alone.
+        sources_markup = sources_str.replace("{", "{{").replace("}", "}}")
+        print_HTML(
+            "\n\n> <question><b>Question</b>: {query}</question>\n> <answer><b>Answer</b>: {answer}</answer>\n> <b>Sources</b>:\n" + sources_markup,
+            query=query,
+            answer=answer,
+        )
+
+        return answer, sources_str
+
+
+# noinspection PyMissingOrEmptyDocstring
+def main() -> None:
+    """Interactive loop: read queries and answer them until "exit" is entered."""
+    session = PromptSession(auto_suggest=AutoSuggestFromHistory())
+    embeddings = get_embedding_model()[0]
+    qa_system = QASystem(embeddings, persist_directory, model_path, model_n_ctx, model_temp, model_stop, use_mlock, n_gpu_layers)
+    while True:
+        query = prompt_HTML(session, "\n<b>Enter a query</b>: ").strip()
+        if query == "exit":
+            return
+        if query:
+            qa_system.prompt_once(query)
+        else:  # nothing was typed
+            print_HTML("<r>Empty query, skipping</r>")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/casalioy/utils.py b/casalioy/utils.py
new file mode 100644
index 0000000..0813ffd
--- /dev/null
+++ b/casalioy/utils.py
@@ -0,0 +1,95 @@
+"""some useful functions"""
+from pathlib import Path
+
+from huggingface_hub import snapshot_download
+from huggingface_hub.utils import HFValidationError, validate_repo_id
+from prompt_toolkit import HTML, PromptSession, print_formatted_text
+from prompt_toolkit.styles import Style
+from pyexpat import ExpatError
+from requests import HTTPError
+
+style = Style.from_dict(  # tag name -> style string; passed as `style=` by print_HTML and prompt_HTML below
+    {
+        "r": "italic gray",  # remark
+        "w": "italic yellow",  # warning
+        "d": "bold red",  # danger
+        "b": "bold",  # <b> tag
+        "i": "italic",  # <i> tag
+        "question": "ansicyan",  # <question> tag
+        "answer": "ansigreen",  # <answer> tag
+        "source": "ansimagenta",  # <source> tag
+    }
+)
+
+
+def escape_for_html(text, **kwargs) -> str:
+    """Return `text` with every form feed replaced by a space and every backspace replaced by a backslash."""
+    control_chars = {ord("\f"): " ", ord("\b"): "\\"}
+    # kwargs values get the same replacement plus doubled braces, but `kwargs` is this call's own dict:
+    # a caller that unpacked its dict with ** does not see these changes.
+    for key in kwargs:
+        kwargs[key] = str(kwargs[key]).translate(control_chars).replace("{", "{{").replace("}", "}}")
+    return text.translate(control_chars)
+
+
+def print_HTML(text: str, **kwargs) -> None:
+    """Print `text` as HTML markup styled with `style`, filling its `{placeholders}` from kwargs; fall back to a plain print() on failure."""
+    try:
+        kwargs = {k: escape_for_html(v) if isinstance(v, str) else v for k, v in kwargs.items()}  # clean string values too
+        text = escape_for_html(text)
+        print_formatted_text(HTML(text).format(**kwargs), style=style)
+    except (ExpatError, LookupError, ValueError):  # unparsable markup, or unknown/unbalanced {placeholders} in text
+        try:
+            print(text.format(**kwargs))
+        except Exception:
+            print("[Could not properly parse text. This is a CASALIOY error, please open an issue.]", text, kwargs)
+
+
+def prompt_HTML(session: PromptSession, prompt: str, **kwargs) -> str:
+    """Read a line of input, showing `prompt` as HTML markup (plain input() if that fails); return "" if both attempts fail."""
+    try:
+        prompt = escape_for_html(prompt, **kwargs)
+        return session.prompt(HTML(prompt).format(**kwargs), style=style)
+    except (ExpatError, LookupError, ValueError):  # unparsable markup, or unknown/unbalanced {placeholders} in prompt
+        try:
+            return input(prompt.format(**kwargs))
+        except Exception:
+            print("[Could not properly parse text. This is a CASALIOY error, please open an issue.]", prompt, kwargs)
+            return ""  # honour the `-> str` annotation so callers can safely call str methods on the result
+
+
+def download_if_repo(path: str) -> str:
+    """Return a local path for the model/dataset `path`, downloading it from the Hugging Face Hub when it is not under models/.
+
+    A `path` ending in ".bin" names a single file: only that file is fetched and the file's own path is returned.
+    A leading "datasets/" component marks a dataset repo. Returns None, after printing a warning, if the download fails.
+    """
+    split = path.split("/")
+    is_dataset = split[0] == "datasets"
+    is_file = path.endswith(".bin")
+    allow_patterns = split[-1] if is_file else ["*.bin", "*.json"]
+    if is_dataset:  # the "datasets" prefix is not part of the repo id
+        split = split[1:]
+    repo_id = "/".join(split[:2] if is_file else split)  # for a file, the repo id is the first two components
+
+    p = Path(path) if path.startswith("models/") else "models" / Path(path)
+    if (is_file and p.is_file()) or (not is_file and p.is_dir()):
+        print_HTML("<r>found local model {kind} at {p}</r>", kind="file" if is_file else "dir", p=p)
+        return str(p)
+
+    try:
+        validate_repo_id(repo_id)
+        print_HTML("<r>Downloading {model_type} {model} from HF</r>", model=path, model_type="dataset" if is_dataset else "model")
+        new_path = Path(
+            snapshot_download(
+                repo_id=repo_id,
+                allow_patterns=allow_patterns,
+                local_dir=str(p.parent if is_file else p),
+                repo_type="dataset" if is_dataset else None,
+                local_dir_use_symlinks=False,
+            )
+        ).resolve()
+        # snapshot_download returns the folder it wrote to; for a single-file request point at the file itself
+        return str(new_path / split[-1] if is_file else new_path)
+    except (HFValidationError, HTTPError) as e:
+        print_HTML("<w>Could not download model {model} from HF: {e}</w>", model=path, e=e)
diff --git a/example.env b/example.env
index a954796..dbad1bf 100644
--- a/example.env
+++ b/example.env
@@ -1,18 +1,23 @@
 # Generic
-MODEL_N_CTX=1024
-TEXT_EMBEDDINGS_MODEL=all-MiniLM-L6-v2
+TEXT_EMBEDDINGS_MODEL=sentence-transformers/all-MiniLM-L6-v2
 TEXT_EMBEDDINGS_MODEL_TYPE=HF  # LlamaCpp or HF
-USE_MLOCK=true
+USE_MLOCK=false
 
 # Ingestion
 PERSIST_DIRECTORY=db
 DOCUMENTS_DIRECTORY=source_documents
 INGEST_CHUNK_SIZE=500
 INGEST_CHUNK_OVERLAP=50
+INGEST_N_THREADS=3
 
 # Generation
 MODEL_TYPE=LlamaCpp # GPT4All or LlamaCpp
-MODEL_PATH=models/ggml-vic7b-q5_1.bin
+MODEL_PATH=eachadea/ggml-vicuna-7b-1.1/ggml-vic7b-q5_1.bin
 MODEL_TEMP=0.8
+MODEL_N_CTX=1024  # Max total size of prompt+answer
+MODEL_MAX_TOKENS=256  # Max size of answer
 MODEL_STOP=[STOP]
-CHAIN_TYPE=stuff
+CHAIN_TYPE=betterstuff
+N_RETRIEVE_DOCUMENTS=100 # How many documents to retrieve from the db
+N_FORWARD_DOCUMENTS=100 # How many documents to forward to the LLM, chosen among those retrieved
+N_GPU_LAYERS=4
diff --git a/ingest.py b/ingest.py
deleted file mode 100644
index fe8d117..0000000
--- a/ingest.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""ingest documents into vector database using embedding"""
-import os
-import shutil
-import sys
-from hashlib import md5
-from pathlib import Path
-from typing import Callable
-
-from langchain.docstore.document import Document
-from langchain.document_loaders import (
-    CSVLoader,
-    Docx2txtLoader,
-    PDFMinerLoader,
-    TextLoader,
-    UnstructuredEPubLoader,
-    UnstructuredHTMLLoader,
-    UnstructuredPowerPointLoader,
-    UnstructuredEmailLoader,
-    OutlookMessageLoader
-)
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from qdrant_client import QdrantClient, models
-
-from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, persist_directory
-
-file_loaders = {  # extension -> loader
-    "txt": lambda path: TextLoader(path, encoding="utf8"),
-    "pdf": PDFMinerLoader,
-    "csv": CSVLoader,
-    "epub": UnstructuredEPubLoader,
-    "html": UnstructuredHTMLLoader,
-    "docx": Docx2txtLoader,
-    "pptx": UnstructuredPowerPointLoader,
-    "eml": UnstructuredEmailLoader,
-    "msg": OutlookMessageLoader,
-}
-
-
-def load_one_doc(filepath: Path) -> list[Document]:
-    """load one document"""
-    if filepath.suffix[1:] not in file_loaders:
-        print(f"Unhandled file format: {filepath.name} in {filepath.parent}")
-        return []
-
-    return file_loaders[filepath.suffix[1:]](str(filepath)).load()
-
-
-def embed_documents_with_progress(embedding_function: Callable, texts: list[str]) -> list[list[float]]:
-    """wrapper around embed_documents that prints progress"""
-    embeddings = []
-    N_chunks = len(texts)
-    for i, text in enumerate(texts):
-        print(f"embedding chunk {i + 1}/{N_chunks}")
-        embeddings.append(embedding_function(text))
-
-    return [list(map(float, e)) for e in embeddings]
-
-
-def main(sources_directory: str, cleandb: str) -> None:
-    """enables to run python random_path/ to ingest // or 'python random_path/ y' to purge existing db"""
-    db_dir = persist_directory  # can be changed to ":memory:" but is not persistant
-    if os.path.exists(db_dir):
-        if cleandb.lower() == "y" or (cleandb == "n" and input("\nDelete current database?(Y/N): ").lower() == "y"):
-            print("Deleting db...")
-            shutil.rmtree(db_dir)
-        elif cleandb.lower() == "n":
-            print("Adding to db...")
-
-    documents = []
-    for root, dirs, files in os.walk(sources_directory):
-        for file in files:
-            documents += load_one_doc(Path(root) / file)
-
-    # Split text
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    split_documents = text_splitter.split_documents(documents)
-    texts = [d.page_content for d in split_documents]
-    metadatas = [d.metadata for d in split_documents]
-    print(f"Found {len(split_documents)} chunks from {len(documents)} documents to index")
-
-    # Generate embeddings
-    print("Generating embeddings...")
-    embedding_model, encode_fun = get_embedding_model()
-    embeddings = embed_documents_with_progress(encode_fun, texts)
-
-    # Store embeddings
-    print("Storing embeddings...")
-    client = QdrantClient(path=db_dir, prefer_grpc=True)  # using Qdrant.from_documents recreates the db each time
-    try:
-        collection = client.get_collection("test")
-    except ValueError:  # doesn't exist
-        print("Creating a new store")
-        # Just do a single quick embedding to get vector size
-        vector_size = max(len(e) for e in embeddings)
-        client.recreate_collection(
-            collection_name="test",
-            vectors_config=models.VectorParams(
-                size=vector_size,
-                distance=models.Distance["COSINE"],
-            ),
-        )
-        collection = client.get_collection("test")
-    print(f"Loaded collection has {collection.points_count} data points")
-    client.upsert(
-        collection_name="test",
-        points=models.Batch.construct(
-            ids=[md5(text.encode("utf-8")).hexdigest() for text in texts],
-            vectors=embeddings,
-            payloads=[{"page_content": text, "metadata": metadatas[i]} for i, text in enumerate(texts)],
-        ),
-    )
-    collection = client.get_collection("test")
-    print(f"Indexed {len(split_documents)} chunks from {len(documents)} documents in Qdrant. Total points: {collection.points_count}")
-
-
-if __name__ == "__main__":
-    sources_directory = sys.argv[1] if len(sys.argv) > 1 else documents_directory
-    cleandb = sys.argv[2] if len(sys.argv) > 2 else "n"
-    main(sources_directory, cleandb)
diff --git a/meta.json b/meta.json
deleted file mode 100644
index 5af20a4..0000000
--- a/meta.json
+++ /dev/null
@@ -1 +0,0 @@
-{"collections": {"db": {"vectors": {"size": 4096, "distance": "Cosine", "hnsw_config": null, "quantization_config": null}, "shard_number": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null}}, "aliases": {}}
diff --git a/poetry.lock b/poetry.lock
index ec101a8..bf3d58c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,10 +1,22 @@
 # This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
 
+[[package]]
+name = "aiofiles"
+version = "23.1.0"
+description = "File support for asyncio."
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "aiofiles-23.1.0-py3-none-any.whl", hash = "sha256:9312414ae06472eb6f1d163f555e466a23aed1c8f60c30cccf7121dba2e53eb2"},
+    {file = "aiofiles-23.1.0.tar.gz", hash = "sha256:edd247df9a19e0db16534d4baaf536d6609a43e1de5401d7a4c1c148753a1635"},
+]
+
 [[package]]
 name = "aiohttp"
 version = "3.8.4"
 description = "Async http client/server framework (asyncio)"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
@@ -113,7 +125,7 @@ speedups = ["Brotli", "aiodns", "cchardet"]
 name = "aiosignal"
 version = "1.3.1"
 description = "aiosignal: a list of registered asynchronous callbacks"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -203,7 +215,7 @@ server = ["PyYAML (>=5.4.1,<6.1.0)", "SQLAlchemy (>=2.0.0,<2.1.0)", "aiofiles (>
 name = "async-timeout"
 version = "4.0.2"
 description = "Timeout context manager for asyncio programs"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
@@ -215,7 +227,7 @@ files = [
 name = "attrs"
 version = "23.1.0"
 description = "Classes Without Boilerplate"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -246,7 +258,7 @@ files = [
 name = "beautifulsoup4"
 version = "4.12.2"
 description = "Screen-scraping library"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.6.0"
 files = [
@@ -303,7 +315,6 @@ packaging = ">=22.0"
 pathspec = ">=0.9.0"
 platformdirs = ">=2"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
-typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
 
 [package.extras]
 colorama = ["colorama (>=0.4.3)"]
@@ -339,7 +350,7 @@ files = [
 name = "certifi"
 version = "2023.5.7"
 description = "Python package for providing Mozilla's CA Bundle."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
@@ -436,6 +447,18 @@ files = [
     {file = "cfgv-3.3.1.tar.gz", hash = "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736"},
 ]
 
+[[package]]
+name = "chardet"
+version = "5.1.0"
+description = "Universal encoding detector for Python 3"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"},
+    {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"},
+]
+
 [[package]]
 name = "charset-normalizer"
 version = "3.1.0"
@@ -548,6 +571,18 @@ files = [
     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]
 
+[[package]]
+name = "colorclass"
+version = "2.2.2"
+description = "Colorful worry-free console applications for Linux, Mac OS X, and Windows."
+category = "dev"
+optional = false
+python-versions = ">=2.6"
+files = [
+    {file = "colorclass-2.2.2-py2.py3-none-any.whl", hash = "sha256:6f10c273a0ef7a1150b1120b6095cbdd68e5cf36dfd5d0fc957a2500bbf99a55"},
+    {file = "colorclass-2.2.2.tar.gz", hash = "sha256:6d4fe287766166a98ca7bc6f6312daf04a0481b1eda43e7173484051c0ab4366"},
+]
+
 [[package]]
 name = "commonmark"
 version = "0.9.1"
@@ -563,6 +598,17 @@ files = [
 [package.extras]
 test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"]
 
+[[package]]
+name = "compressed-rtf"
+version = "1.0.6"
+description = "Compressed Rich Text Format (RTF) compression and decompression package"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "compressed_rtf-1.0.6.tar.gz", hash = "sha256:c1c827f1d124d24608981a56e8b8691eb1f2a69a78ccad6440e7d92fde1781dd"},
+]
+
 [[package]]
 name = "contourpy"
 version = "1.0.7"
@@ -765,6 +811,29 @@ files = [
     {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"},
 ]
 
+[[package]]
+name = "easygui"
+version = "0.98.3"
+description = "EasyGUI is a module for very simple, very easy GUI programming in Python.  EasyGUI is different from other GUI generators in that EasyGUI is NOT event-driven.  Instead, all GUI interactions are invoked by simple function calls."
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "easygui-0.98.3-py2.py3-none-any.whl", hash = "sha256:33498710c68b5376b459cd3fc48d1d1f33822139eb3ed01defbc0528326da3ba"},
+    {file = "easygui-0.98.3.tar.gz", hash = "sha256:d653ff79ee1f42f63b5a090f2f98ce02335d86ad8963b3ce2661805cafe99a04"},
+]
+
+[[package]]
+name = "ebcdic"
+version = "1.1.1"
+description = "Additional EBCDIC codecs"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1"},
+]
+
 [[package]]
 name = "entrypoints"
 version = "0.4"
@@ -789,16 +858,43 @@ files = [
     {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
 ]
 
+[[package]]
+name = "extract-msg"
+version = "0.41.1"
+description = "Extracts emails and attachments saved in Microsoft Outlook's .msg files"
+category = "dev"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "extract_msg-0.41.1-py2.py3-none-any.whl", hash = "sha256:b7a65d2efad09f756521d87c997626f4e5fc21a617a3f0bf4515f54ad5944dbf"},
+    {file = "extract_msg-0.41.1.tar.gz", hash = "sha256:873d3c4fd9a60a65147a23d40f8bbbe21c4d9b5197dddbf1535a9ef190aa86de"},
+]
+
+[package.dependencies]
+beautifulsoup4 = ">=4.11.1,<4.13"
+chardet = ">=4.0.0,<6"
+compressed-rtf = "1.0.6"
+ebcdic = "1.1.1"
+imapclient = ">=2.3.0,<3"
+olefile = "0.46"
+red-black-tree-mod = "1.20"
+RTFDE = "0.0.2"
+tzlocal = "4.2"
+
+[package.extras]
+all = ["extract-msg[mime]"]
+mime = ["python-magic (>=0.4.27,<0.5)"]
+
 [[package]]
 name = "faker"
-version = "18.7.0"
+version = "18.9.0"
 description = "Faker is a Python package that generates fake data for you."
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Faker-18.7.0-py3-none-any.whl", hash = "sha256:38dbc3b80e655d7301e190426ab30f04b6b7f6ca4764c5dd02772ffde0fa6dcd"},
-    {file = "Faker-18.7.0.tar.gz", hash = "sha256:f02c6d3fdb5bc781f80b440cf2bdec336ed47ecfb8d620b20c3d4188ed051831"},
+    {file = "Faker-18.9.0-py3-none-any.whl", hash = "sha256:defe9ed618a67ebf0f3eb1895e198c2355a7128a09087a6dce342ef2253263ea"},
+    {file = "Faker-18.9.0.tar.gz", hash = "sha256:80a5ea1464556c06b98bf47ea3adc7f33811a1182518d847860b1874080bd3c9"},
 ]
 
 [package.dependencies]
@@ -824,7 +920,7 @@ requests = ">=2.21.0"
 name = "filelock"
 version = "3.12.0"
 description = "A platform independent file lock."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -866,7 +962,7 @@ woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"]
 name = "frozenlist"
 version = "1.3.3"
 description = "A list-like structure which implements collections.abc.MutableSequence"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -982,36 +1078,6 @@ smb = ["smbprotocol"]
 ssh = ["paramiko"]
 tqdm = ["tqdm"]
 
-[[package]]
-name = "gitdb"
-version = "4.0.10"
-description = "Git Object Database"
-category = "dev"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "gitdb-4.0.10-py3-none-any.whl", hash = "sha256:c286cf298426064079ed96a9e4a9d39e7f3e9bf15ba60701e95f5492f28415c7"},
-    {file = "gitdb-4.0.10.tar.gz", hash = "sha256:6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a"},
-]
-
-[package.dependencies]
-smmap = ">=3.0.1,<6"
-
-[[package]]
-name = "gitpython"
-version = "3.1.31"
-description = "GitPython is a Python library used to interact with Git repositories"
-category = "dev"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "GitPython-3.1.31-py3-none-any.whl", hash = "sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d"},
-    {file = "GitPython-3.1.31.tar.gz", hash = "sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573"},
-]
-
-[package.dependencies]
-gitdb = ">=4.0.1,<5"
-
 [[package]]
 name = "greenlet"
 version = "2.0.2"
@@ -1368,7 +1434,7 @@ license = ["ukkonen"]
 name = "idna"
 version = "3.4"
 description = "Internationalized Domain Names in Applications (IDNA)"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.5"
 files = [
@@ -1377,43 +1443,43 @@ files = [
 ]
 
 [[package]]
-name = "importlib-metadata"
-version = "6.6.0"
-description = "Read metadata from Python packages"
+name = "imapclient"
+version = "2.3.1"
+description = "Easy-to-use, Pythonic and complete IMAP client library"
 category = "dev"
 optional = false
-python-versions = ">=3.7"
+python-versions = "*"
 files = [
-    {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"},
-    {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"},
+    {file = "IMAPClient-2.3.1-py2.py3-none-any.whl", hash = "sha256:057f28025d2987c63e065afb0e4370b0b850b539b0e1494cea0427e88130108c"},
+    {file = "IMAPClient-2.3.1.zip", hash = "sha256:26ea995664fae3a88b878ebce2aff7402931697b86658b7882043ddb01b0e6ba"},
 ]
 
 [package.dependencies]
-zipp = ">=0.5"
+six = "*"
 
 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-perf = ["ipython"]
-testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"]
+doc = ["sphinx"]
+test = ["mock (>=1.3.0)"]
 
 [[package]]
-name = "importlib-resources"
-version = "5.12.0"
-description = "Read resources from Python packages"
+name = "importlib-metadata"
+version = "6.6.0"
+description = "Read metadata from Python packages"
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "importlib_resources-5.12.0-py3-none-any.whl", hash = "sha256:7b1deeebbf351c7578e09bf2f63fa2ce8b5ffec296e0d349139d43cca061a81a"},
-    {file = "importlib_resources-5.12.0.tar.gz", hash = "sha256:4be82589bf5c1d7999aedf2a45159d10cb3ca4f19b2271f8792bc8e6da7b22f6"},
+    {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"},
+    {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"},
 ]
 
 [package.dependencies]
-zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""}
+zipp = ">=0.5"
 
 [package.extras]
 docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
-testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
+perf = ["ipython"]
+testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"]
 
 [[package]]
 name = "isort"
@@ -1563,14 +1629,14 @@ files = [
 
 [[package]]
 name = "langchain"
-version = "0.0.168"
+version = "0.0.171"
 description = "Building applications with LLMs through composability"
 category = "dev"
 optional = false
 python-versions = ">=3.8.1,<4.0"
 files = [
-    {file = "langchain-0.0.168-py3-none-any.whl", hash = "sha256:122590f9ccb343f19eefc5f7a548c96fab10d15fcf66fc33c0d9937b157f1c6b"},
-    {file = "langchain-0.0.168.tar.gz", hash = "sha256:ed1a38a5d0bff9f06250a928be25ca929567e36d409df8ca9f7a7a33a7b10790"},
+    {file = "langchain-0.0.171-py3-none-any.whl", hash = "sha256:ac014d1912bdbadf608120b29981e4177f293bcdf50e0987f682c1f34f3d3b3e"},
+    {file = "langchain-0.0.171.tar.gz", hash = "sha256:d32dba400c35a71221bb7e903175ee5ea4e9decf4354cedd070adf95fb1e4d16"},
 ]
 
 [package.dependencies]
@@ -1587,17 +1653,34 @@ SQLAlchemy = ">=1.4,<3"
 tenacity = ">=8.1.0,<9.0.0"
 
 [package.extras]
-all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.2.6,<0.3.0)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=3,<4)", "deeplake (>=3.3.0,<4.0.0)", "docarray (>=0.31.0,<0.32.0)", "duckduckgo-search (>=2.8.6,<3.0.0)", "elasticsearch (>=8,<9)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "hnswlib (>=0.7.0,<0.8.0)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jina (>=3.14,<4.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "lark (>=1.1.5,<2.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "networkx (>=2.6.3,<3.0.0)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "openai (>=0,<1)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "protobuf (==3.19)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.1.2,<2.0.0)", "redis (>=4,<5)", "sentence-transformers (>=2,<3)", "spacy (>=3,<4)", "steamship (>=2.16.9,<3.0.0)", "tensorflow-text (>=2.11.0,<3.0.0)", "tiktoken (>=0.3.2,<0.4.0)", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)"]
+all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.2.6,<0.3.0)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=3,<4)", "deeplake (>=3.3.0,<4.0.0)", "docarray (>=0.31.0,<0.32.0)", "duckduckgo-search (>=2.8.6,<3.0.0)", "elasticsearch (>=8,<9)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "gql (>=3.4.1,<4.0.0)", "hnswlib (>=0.7.0,<0.8.0)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jina (>=3.14,<4.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "lark (>=1.1.5,<2.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "networkx (>=2.6.3,<3.0.0)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "openai (>=0,<1)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "protobuf (==3.19)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.1.2,<2.0.0)", "redis (>=4,<5)", "sentence-transformers (>=2,<3)", "spacy (>=3,<4)", "steamship (>=2.16.9,<3.0.0)", "tensorflow-text (>=2.11.0,<3.0.0)", "tiktoken (>=0.3.2,<0.4.0)", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)"]
 azure = ["azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "openai (>=0,<1)"]
 cohere = ["cohere (>=3,<4)"]
 embeddings = ["sentence-transformers (>=2,<3)"]
-extended-testing = ["pdfminer-six (>=20221105,<20221106)", "pypdf (>=3.4.0,<4.0.0)", "tqdm (>=4.48.0)"]
+extended-testing = ["jq (>=1.4.1,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "pdfminer-six (>=20221105,<20221106)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "tqdm (>=4.48.0)"]
 hnswlib = ["docarray (>=0.31.0,<0.32.0)", "hnswlib (>=0.7.0,<0.8.0)", "protobuf (==3.19)"]
 in-memory-store = ["docarray (>=0.31.0,<0.32.0)"]
 llms = ["anthropic (>=0.2.6,<0.3.0)", "cohere (>=3,<4)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (>=0,<1)", "torch (>=1,<3)", "transformers (>=4,<5)"]
 openai = ["openai (>=0,<1)", "tiktoken (>=0.3.2,<0.4.0)"]
 qdrant = ["qdrant-client (>=1.1.2,<2.0.0)"]
 
+[[package]]
+name = "lark-parser"
+version = "0.12.0"
+description = "a modern parsing library"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "lark-parser-0.12.0.tar.gz", hash = "sha256:15967db1f1214013dca65b1180745047b9be457d73da224fcda3d9dd4e96a138"},
+    {file = "lark_parser-0.12.0-py2.py3-none-any.whl", hash = "sha256:0eaf30cb5ba787fe404d73a7d6e61df97b21d5a63ac26c5008c78a494373c675"},
+]
+
+[package.extras]
+atomic-cache = ["atomicwrites"]
+nearley = ["js2py"]
+regex = ["regex"]
+
 [[package]]
 name = "libcst"
 version = "0.4.9"
@@ -1646,6 +1729,26 @@ typing-inspect = ">=0.4.0"
 [package.extras]
 dev = ["Sphinx (>=5.1.1)", "black (==22.10.0)", "coverage (>=4.5.4)", "fixit (==0.1.1)", "flake8 (>=3.7.8,<5)", "hypothesis (>=4.36.0)", "hypothesmith (>=0.0.4)", "jinja2 (==3.1.2)", "jupyter (>=1.0.0)", "maturin (>=0.8.3,<0.14)", "nbsphinx (>=0.4.2)", "prompt-toolkit (>=2.0.9)", "pyre-check (==0.9.9)", "setuptools-rust (>=1.5.2)", "setuptools-scm (>=6.0.1)", "slotscheck (>=0.7.1)", "sphinx-rtd-theme (>=0.4.3)", "ufmt (==2.0.1)", "usort (==1.0.5)"]
 
+[[package]]
+name = "libgenesis"
+version = "0.1.9"
+description = "Asynchronous python lib for Libgen.rs"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "libgenesis-0.1.9-py3-none-any.whl", hash = "sha256:7f34c48de4fa4d3642589a6504540015dee22d74cb242de56b12976f486b124f"},
+    {file = "libgenesis-0.1.9.tar.gz", hash = "sha256:fa32508a380c2fd0fc0c9d07931b5b4c1beaeed79c01c932658777716d2ad23f"},
+]
+
+[package.dependencies]
+aiofiles = ">=0.5.0"
+aiohttp = ">=3.6.0"
+beautifulsoup4 = ">=4.9.0"
+lxml = ">=4.6.5"
+requests = ">=2.25.0"
+tldextract = ">=3.1.0"
+
 [[package]]
 name = "llama-cpp-python"
 version = "0.1.50"
@@ -1664,7 +1767,7 @@ typing-extensions = ">=4.5.0,<5.0.0"
 name = "lxml"
 version = "4.9.2"
 description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
 files = [
@@ -1765,9 +1868,6 @@ files = [
     {file = "Markdown-3.4.3.tar.gz", hash = "sha256:8bf101198e004dc93e84a12a7395e31aac6a9c9942848ae1d99b9d72cf9b3520"},
 ]
 
-[package.dependencies]
-importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""}
-
 [package.extras]
 testing = ["coverage", "pyyaml"]
 
@@ -1943,7 +2043,6 @@ files = [
 contourpy = ">=1.0.1"
 cycler = ">=0.10"
 fonttools = ">=4.22.0"
-importlib-resources = {version = ">=3.2.0", markers = "python_version < \"3.10\""}
 kiwisolver = ">=1.0.1"
 numpy = ">=1.20"
 packaging = ">=20.0"
@@ -2011,11 +2110,27 @@ olefile = ">=0.46"
 [package.extras]
 rtf = ["compressed-rtf (>=1.0.5)"]
 
+[[package]]
+name = "msoffcrypto-tool"
+version = "5.0.1"
+description = "Python tool and library for decrypting MS Office files with passwords or other keys"
+category = "dev"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "msoffcrypto_tool-5.0.1-py3-none-any.whl", hash = "sha256:2b489c8a2b13bec07b94c8f5ce9054111dec3223ff8bedfd486cae3c299be54b"},
+    {file = "msoffcrypto_tool-5.0.1.tar.gz", hash = "sha256:9efd0ef5cc3e086e2d175e7a5d7b2b8cb59836c896b8a486d362bbca166db645"},
+]
+
+[package.dependencies]
+cryptography = ">=35.0"
+olefile = ">=0.46"
+
 [[package]]
 name = "multidict"
 version = "6.0.4"
 description = "multidict implementation"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -2259,6 +2374,29 @@ files = [
     {file = "olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964"},
 ]
 
+[[package]]
+name = "oletools"
+version = "0.60.1"
+description = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "oletools-0.60.1-py2.py3-none-any.whl", hash = "sha256:edef92374e688989a39269eb9a11142fb20a023629c23538c849c14d1d1144ea"},
+    {file = "oletools-0.60.1.zip", hash = "sha256:67a796da4c4b8e2feb9a6b2495bef8798a3323a75512de4e5669d9dc9d1fae31"},
+]
+
+[package.dependencies]
+colorclass = "*"
+easygui = "*"
+msoffcrypto-tool = {version = "*", markers = "platform_python_implementation != \"PyPy\" or python_version >= \"3\" and platform_system != \"Windows\" and platform_system != \"Darwin\""}
+olefile = ">=0.46"
+pcodedmp = ">=1.2.5"
+pyparsing = ">=2.1.0,<3"
+
+[package.extras]
+full = ["XLMMacroDeobfuscator"]
+
 [[package]]
 name = "openapi-schema-pydantic"
 version = "1.2.4"
@@ -2340,7 +2478,6 @@ files = [
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.20.3", markers = "python_version < \"3.10\""},
     {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
     {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
 ]
@@ -2350,18 +2487,6 @@ pytz = ">=2020.1"
 [package.extras]
 test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
 
-[[package]]
-name = "pathlib"
-version = "1.0.1"
-description = "Object-oriented filesystem paths"
-category = "main"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pathlib-1.0.1-py3-none-any.whl", hash = "sha256:f35f95ab8b0f59e6d354090350b44a80a80635d22efdedfa84c7ad1cf0a74147"},
-    {file = "pathlib-1.0.1.tar.gz", hash = "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"},
-]
-
 [[package]]
 name = "pathspec"
 version = "0.10.3"
@@ -2374,6 +2499,22 @@ files = [
     {file = "pathspec-0.10.3.tar.gz", hash = "sha256:56200de4077d9d0791465aa9095a01d421861e405b5096955051deefd697d6f6"},
 ]
 
+[[package]]
+name = "pcodedmp"
+version = "1.2.6"
+description = "A VBA p-code disassembler"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "pcodedmp-1.2.6-py2.py3-none-any.whl", hash = "sha256:4441f7c0ab4cbda27bd4668db3b14f36261d86e5059ce06c0828602cbe1c4278"},
+    {file = "pcodedmp-1.2.6.tar.gz", hash = "sha256:025f8c809a126f45a082ffa820893e6a8d990d9d7ddb68694b5a9f0a6dbcd955"},
+]
+
+[package.dependencies]
+oletools = ">=0.54"
+win-unicode-console = {version = "*", markers = "platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}
+
 [[package]]
 name = "pdfminer-six"
 version = "20221105"
@@ -2530,27 +2671,42 @@ nodeenv = ">=0.11.1"
 pyyaml = ">=5.1"
 virtualenv = ">=20.10.0"
 
+[[package]]
+name = "prompt-toolkit"
+version = "3.0.38"
+description = "Library for building powerful interactive command lines in Python"
+category = "main"
+optional = false
+python-versions = ">=3.7.0"
+files = [
+    {file = "prompt_toolkit-3.0.38-py3-none-any.whl", hash = "sha256:45ea77a2f7c60418850331366c81cf6b5b9cf4c7fd34616f733c5427e6abbb1f"},
+    {file = "prompt_toolkit-3.0.38.tar.gz", hash = "sha256:23ac5d50538a9a38c8bde05fecb47d0b403ecd0662857a86f886f798563d5b9b"},
+]
+
+[package.dependencies]
+wcwidth = "*"
+
 [[package]]
 name = "protobuf"
-version = "4.23.0"
+version = "4.23.1"
 description = ""
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "protobuf-4.23.0-cp310-abi3-win32.whl", hash = "sha256:6c16657d6717a0c62d5d740cb354fbad1b0d8cb811669e06fc1caa0ff4799ddd"},
-    {file = "protobuf-4.23.0-cp310-abi3-win_amd64.whl", hash = "sha256:baca40d067dddd62141a129f244703160d278648b569e90bb0e3753067644711"},
-    {file = "protobuf-4.23.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:2b94bd6df92d71bd1234a2ffe7ce96ddf6d10cf637a18d6b55ad0a89fbb7fc21"},
-    {file = "protobuf-4.23.0-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:9f5a0fbfcdcc364f3986f9ed9f8bb1328fb84114fd790423ff3d7fdb0f85c2d1"},
-    {file = "protobuf-4.23.0-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:ebde3a023b8e11bfa6c890ef34cd6a8b47d586f26135e86c21344fe433daf2e2"},
-    {file = "protobuf-4.23.0-cp37-cp37m-win32.whl", hash = "sha256:7cb5b9a05ce52c6a782bb97de52679bd3438ff2b7460eff5da348db65650f227"},
-    {file = "protobuf-4.23.0-cp37-cp37m-win_amd64.whl", hash = "sha256:6fe180b56e1169d72ecc4acbd39186339aed20af5384531b8e8979b02bbee159"},
-    {file = "protobuf-4.23.0-cp38-cp38-win32.whl", hash = "sha256:d5a35ff54e3f62e8fc7be02bb0d2fbc212bba1a5a9cc2748090690093996f07b"},
-    {file = "protobuf-4.23.0-cp38-cp38-win_amd64.whl", hash = "sha256:e62fb869762b4ba18666370e2f8a18f17f8ab92dd4467295c6d38be6f8fef60b"},
-    {file = "protobuf-4.23.0-cp39-cp39-win32.whl", hash = "sha256:03eee35b60317112a72d19c54d0bff7bc58ff12fea4cd7b018232bd99758ffdf"},
-    {file = "protobuf-4.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:36f5370a930cb77c8ad2f4135590c672d0d2c72d4a707c7d0058dce4b4b4a598"},
-    {file = "protobuf-4.23.0-py3-none-any.whl", hash = "sha256:9744e934ea5855d12191040ea198eaf704ac78665d365a89d9572e3b627c2688"},
-    {file = "protobuf-4.23.0.tar.gz", hash = "sha256:5f1eba1da2a2f3f7df469fccddef3cc060b8a16cfe3cc65961ad36b4dbcf59c5"},
+    {file = "protobuf-4.23.1-cp310-abi3-win32.whl", hash = "sha256:410bcc0a5b279f634d3e16082ce221dfef7c3392fac723500e2e64d1806dd2be"},
+    {file = "protobuf-4.23.1-cp310-abi3-win_amd64.whl", hash = "sha256:32e78beda26d7a101fecf15d7a4a792278a0d26a31bc327ff05564a9d68ab8ee"},
+    {file = "protobuf-4.23.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f9510cac91e764e86acd74e2b7f7bc5e6127a7f3fb646d7c8033cfb84fd1176a"},
+    {file = "protobuf-4.23.1-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:346990f634272caac1f09efbcfbbacb23098b1f606d172534c6fa2d9758bb436"},
+    {file = "protobuf-4.23.1-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:3ce113b3f3362493bddc9069c2163a38f240a9ed685ff83e7bcb756b05e1deb0"},
+    {file = "protobuf-4.23.1-cp37-cp37m-win32.whl", hash = "sha256:2036a3a1e7fc27f973fa0a7888dce712393af644f4695385f117886abc792e39"},
+    {file = "protobuf-4.23.1-cp37-cp37m-win_amd64.whl", hash = "sha256:3b8905eafe4439076e1f58e9d1fa327025fd2777cf90f14083092ae47f77b0aa"},
+    {file = "protobuf-4.23.1-cp38-cp38-win32.whl", hash = "sha256:5b9cd6097e6acae48a68cb29b56bc79339be84eca65b486910bb1e7a30e2b7c1"},
+    {file = "protobuf-4.23.1-cp38-cp38-win_amd64.whl", hash = "sha256:decf119d54e820f298ee6d89c72d6b289ea240c32c521f00433f9dc420595f38"},
+    {file = "protobuf-4.23.1-cp39-cp39-win32.whl", hash = "sha256:91fac0753c3c4951fbb98a93271c43cc7cf3b93cf67747b3e600bb1e5cc14d61"},
+    {file = "protobuf-4.23.1-cp39-cp39-win_amd64.whl", hash = "sha256:ac50be82491369a9ec3710565777e4da87c6d2e20404e0abb1f3a8f10ffd20f0"},
+    {file = "protobuf-4.23.1-py3-none-any.whl", hash = "sha256:65f0ac96ef67d7dd09b19a46aad81a851b6f85f89725577f16de38f2d68ad477"},
+    {file = "protobuf-4.23.1.tar.gz", hash = "sha256:95789b569418a3e32a53f43d7763be3d490a831e9c08042539462b6d972c2d7e"},
 ]
 
 [[package]]
@@ -2675,26 +2831,6 @@ typing-extensions = ">=4.2.0"
 dotenv = ["python-dotenv (>=0.10.4)"]
 email = ["email-validator (>=1.0.3)"]
 
-[[package]]
-name = "pydeck"
-version = "0.8.0"
-description = "Widget for deck.gl maps"
-category = "dev"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "pydeck-0.8.0-py2.py3-none-any.whl", hash = "sha256:a8fa7757c6f24bba033af39db3147cb020eef44012ba7e60d954de187f9ed4d5"},
-    {file = "pydeck-0.8.0.tar.gz", hash = "sha256:07edde833f7cfcef6749124351195aa7dcd24663d4909fd7898dbd0b6fbc01ec"},
-]
-
-[package.dependencies]
-jinja2 = ">=2.10.1"
-numpy = ">=1.16.4"
-
-[package.extras]
-carto = ["pydeck-carto"]
-jupyter = ["ipykernel (>=5.1.2)", "ipython (>=5.8.0)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
-
 [[package]]
 name = "pygments"
 version = "2.15.1"
@@ -2832,14 +2968,14 @@ files = [
 
 [[package]]
 name = "pymdown-extensions"
-version = "9.11"
+version = "10.0.1"
 description = "Extension pack for Python Markdown."
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pymdown_extensions-9.11-py3-none-any.whl", hash = "sha256:a499191d8d869f30339de86fcf072a787e86c42b6f16f280f5c2cf174182b7f3"},
-    {file = "pymdown_extensions-9.11.tar.gz", hash = "sha256:f7e86c1d3981f23d9dc43294488ecb54abadd05b0be4bf8f0e15efc90f7853ff"},
+    {file = "pymdown_extensions-10.0.1-py3-none-any.whl", hash = "sha256:ae66d84013c5d027ce055693e09a4628b67e9dec5bce05727e45b0918e36f274"},
+    {file = "pymdown_extensions-10.0.1.tar.gz", hash = "sha256:b44e1093a43b8a975eae17b03c3a77aad4681b3b56fce60ce746dbef1944c8cb"},
 ]
 
 [package.dependencies]
@@ -2871,19 +3007,31 @@ files = [
 ]
 
 [[package]]
-name = "pyparsing"
-version = "3.0.9"
-description = "pyparsing module - Classes and methods to define and execute parsing grammars"
+name = "pypandoc-binary"
+version = "1.11"
+description = "Thin wrapper for pandoc."
 category = "dev"
 optional = false
-python-versions = ">=3.6.8"
+python-versions = ">=3.6"
 files = [
-    {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
-    {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
+    {file = "pypandoc_binary-1.11-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:ebd8036a71fb67c0d3bfc0f50a6af390afe0728ebe17b779f676fd25df76fca1"},
+    {file = "pypandoc_binary-1.11-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b0df61a96d679309769c21528cfbfb14d32ddee1854ae02e7b35b889d60d9e4"},
+    {file = "pypandoc_binary-1.11-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:49436f0ffa489f02bac546e4fe42cbd3595202ee3a00492616b8f6bc358119c0"},
+    {file = "pypandoc_binary-1.11-py3-none-win32.whl", hash = "sha256:a08a66f12d5672f75cea8f6c29b3579aa70bad4b0c8844efdb6e8f6fddc8b359"},
+    {file = "pypandoc_binary-1.11-py3-none-win_amd64.whl", hash = "sha256:1ab00de66b7f36ba33590415811c1d4c72d9f515c4e8b2f1391f27cbddc7b229"},
 ]
 
-[package.extras]
-diagrams = ["jinja2", "railroad-diagrams"]
+[[package]]
+name = "pyparsing"
+version = "2.4.7"
+description = "Python parsing module"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+files = [
+    {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
+    {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
+]
 
 [[package]]
 name = "pyrsistent"
@@ -3006,6 +3154,21 @@ files = [
     {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
 ]
 
+[[package]]
+name = "pytz-deprecation-shim"
+version = "0.1.0.post0"
+description = "Shims to make deprecation of pytz easier"
+category = "dev"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
+files = [
+    {file = "pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl", hash = "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6"},
+    {file = "pytz_deprecation_shim-0.1.0.post0.tar.gz", hash = "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"},
+]
+
+[package.dependencies]
+tzdata = {version = "*", markers = "python_version >= \"3.6\""}
+
 [[package]]
 name = "pywin32"
 version = "306"
@@ -3102,6 +3265,17 @@ pydantic = ">=1.8,<2.0"
 typing-extensions = ">=4.0.0,<5.0.0"
 urllib3 = ">=1.26.14,<2.0.0"
 
+[[package]]
+name = "red-black-tree-mod"
+version = "1.20"
+description = "Flexible python implementation of red black trees"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "red-black-tree-mod-1.20.tar.gz", hash = "sha256:2448e6fc9cbf1be204c753f352c6ee49aa8156dbf1faa57dfc26bd7705077e0a"},
+]
+
 [[package]]
 name = "regex"
 version = "2023.5.5"
@@ -3204,7 +3378,7 @@ files = [
 name = "requests"
 version = "2.30.0"
 description = "Python HTTP for Humans."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -3222,6 +3396,22 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-file"
+version = "1.5.1"
+description = "File transport adapter for Requests"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "requests-file-1.5.1.tar.gz", hash = "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e"},
+    {file = "requests_file-1.5.1-py2.py3-none-any.whl", hash = "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"},
+]
+
+[package.dependencies]
+requests = ">=1.0.0"
+six = "*"
+
 [[package]]
 name = "rfc3986"
 version = "1.5.0"
@@ -3259,6 +3449,26 @@ pygments = ">=2.6.0,<3.0.0"
 [package.extras]
 jupyter = ["ipywidgets (>=7.5.1,<8.0.0)"]
 
+[[package]]
+name = "rtfde"
+version = "0.0.2"
+description = "A library for extracting HTML content from RTF encapsulated HTML as commonly found in the exchange MSG email format."
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "RTFDE-0.0.2-py3-none-any.whl", hash = "sha256:18386e4f060cee12a2a8035b0acf0cc99689f5dff1bf347bab7e92351860a21d"},
+    {file = "RTFDE-0.0.2.tar.gz", hash = "sha256:b86b5d734950fe8745a5b89133f50554252dbd67c6d1b9265e23ee140e7ea8a2"},
+]
+
+[package.dependencies]
+lark-parser = ">=0.11"
+oletools = ">=0.56"
+
+[package.extras]
+dev = ["lxml (>=4.6)"]
+msg-parse = ["extract-msg (>=0.27)"]
+
 [[package]]
 name = "scikit-learn"
 version = "1.2.2"
@@ -3440,7 +3650,7 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (
 name = "six"
 version = "1.16.0"
 description = "Python 2 and 3 compatibility utilities"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
 files = [
@@ -3448,18 +3658,6 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
-[[package]]
-name = "smmap"
-version = "5.0.0"
-description = "A pure Python implementation of a sliding window memory map manager"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "smmap-5.0.0-py3-none-any.whl", hash = "sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94"},
-    {file = "smmap-5.0.0.tar.gz", hash = "sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936"},
-]
-
 [[package]]
 name = "sniffio"
 version = "1.3.0"
@@ -3476,7 +3674,7 @@ files = [
 name = "soupsieve"
 version = "2.4.1"
 description = "A modern CSS selector implementation for Beautiful Soup."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -3579,7 +3777,7 @@ htbuilder = "*"
 [[package]]
 name = "streamlit"
 version = "1.22.0"
-description = "A faster way to build and share data apps"
+description = ""
 category = "dev"
 optional = false
 python-versions = ">=3.7, !=3.9.7"
@@ -3591,7 +3789,6 @@ altair = ">=3.2.0,<5"
 blinker = ">=1.0.0"
 cachetools = ">=4.0"
 click = ">=7.0"
-gitpython = "!=3.1.19"
 importlib-metadata = ">=1.4"
 numpy = "*"
 packaging = ">=14.1"
@@ -3599,22 +3796,17 @@ pandas = ">=0.25,<3"
 pillow = ">=6.2.0"
 protobuf = ">=3.20,<5"
 pyarrow = ">=4.0"
-pydeck = ">=0.1.dev5"
 pympler = ">=0.9"
 python-dateutil = "*"
 requests = ">=2.4"
 rich = ">=10.11.0"
 tenacity = ">=8.0.0,<9"
 toml = "*"
-tornado = ">=6.0.3"
 typing-extensions = ">=3.10.0.0"
 tzlocal = ">=1.1"
 validators = ">=0.2"
 watchdog = {version = "*", markers = "platform_system != \"Darwin\""}
 
-[package.extras]
-snowflake = ["snowflake-snowpark-python"]
-
 [package.source]
 type = "git"
 url = "https://github.com/hippalectryon-0/streamlit.git"
@@ -3804,6 +3996,21 @@ files = [
 [package.dependencies]
 mpmath = ">=0.19"
 
+[[package]]
+name = "tabulate"
+version = "0.9.0"
+description = "Pretty-print tabular data"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"},
+    {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"},
+]
+
+[package.extras]
+widechars = ["wcwidth"]
+
 [[package]]
 name = "tenacity"
 version = "8.2.2"
@@ -3831,6 +4038,24 @@ files = [
     {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"},
 ]
 
+[[package]]
+name = "tldextract"
+version = "3.4.2"
+description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "tldextract-3.4.2-py3-none-any.whl", hash = "sha256:1f168f0a1c000a757aefed94a5bda10349c58976cb2ef1cc17c6e3b213440521"},
+    {file = "tldextract-3.4.2.tar.gz", hash = "sha256:98e36b0aa3a6d8fd084d80d75ae1372da02027efb556c146a59dfd14457071ba"},
+]
+
+[package.dependencies]
+filelock = ">=3.0.8"
+idna = "*"
+requests = ">=2.1.0"
+requests-file = ">=1.4"
+
 [[package]]
 name = "tokenizers"
 version = "0.13.3"
@@ -4013,27 +4238,6 @@ torch = "2.0.1"
 [package.extras]
 scipy = ["scipy"]
 
-[[package]]
-name = "tornado"
-version = "6.3.2"
-description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed."
-category = "dev"
-optional = false
-python-versions = ">= 3.8"
-files = [
-    {file = "tornado-6.3.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:c367ab6c0393d71171123ca5515c61ff62fe09024fa6bf299cd1339dc9456829"},
-    {file = "tornado-6.3.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b46a6ab20f5c7c1cb949c72c1994a4585d2eaa0be4853f50a03b5031e964fc7c"},
-    {file = "tornado-6.3.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2de14066c4a38b4ecbbcd55c5cc4b5340eb04f1c5e81da7451ef555859c833f"},
-    {file = "tornado-6.3.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:05615096845cf50a895026f749195bf0b10b8909f9be672f50b0fe69cba368e4"},
-    {file = "tornado-6.3.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b17b1cf5f8354efa3d37c6e28fdfd9c1c1e5122f2cb56dac121ac61baa47cbe"},
-    {file = "tornado-6.3.2-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:29e71c847a35f6e10ca3b5c2990a52ce38b233019d8e858b755ea6ce4dcdd19d"},
-    {file = "tornado-6.3.2-cp38-abi3-musllinux_1_1_i686.whl", hash = "sha256:834ae7540ad3a83199a8da8f9f2d383e3c3d5130a328889e4cc991acc81e87a0"},
-    {file = "tornado-6.3.2-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6a0848f1aea0d196a7c4f6772197cbe2abc4266f836b0aac76947872cd29b411"},
-    {file = "tornado-6.3.2-cp38-abi3-win32.whl", hash = "sha256:7efcbcc30b7c654eb6a8c9c9da787a851c18f8ccd4a5a3a95b05c7accfa068d2"},
-    {file = "tornado-6.3.2-cp38-abi3-win_amd64.whl", hash = "sha256:0c325e66c8123c606eea33084976c832aa4e766b7dff8aedd7587ea44a604cdf"},
-    {file = "tornado-6.3.2.tar.gz", hash = "sha256:4b927c4f19b71e627b13f3db2324e4ae660527143f9e1f2e2fb404f3a187e2ba"},
-]
-
 [[package]]
 name = "tqdm"
 version = "4.65.0"
@@ -4057,14 +4261,14 @@ telegram = ["requests"]
 
 [[package]]
 name = "transformers"
-version = "4.29.1"
+version = "4.29.2"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 category = "dev"
 optional = false
 python-versions = ">=3.7.0"
 files = [
-    {file = "transformers-4.29.1-py3-none-any.whl", hash = "sha256:75f851f2420c26410edbdf4a2a1a5b434ab2b96aea36eb5931d06cc3b2e7b509"},
-    {file = "transformers-4.29.1.tar.gz", hash = "sha256:3dc9cd198918e140468edbf37d7edf3b7a75633655ce0771ce323bbf8c118c4d"},
+    {file = "transformers-4.29.2-py3-none-any.whl", hash = "sha256:0ef158b99bad6f4e6652a0d8655fbbe58b4cb788ce7040f320b5d29c7c810a75"},
+    {file = "transformers-4.29.2.tar.gz", hash = "sha256:ed9467661f459f1ce49461d83f18f3b36b6a37f306182dc2ba272935f3b93ebb"},
 ]
 
 [package.dependencies]
@@ -4187,21 +4391,23 @@ files = [
 
 [[package]]
 name = "tzlocal"
-version = "5.0"
+version = "4.2"
 description = "tzinfo object for the local timezone"
 category = "dev"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.6"
 files = [
-    {file = "tzlocal-5.0-py3-none-any.whl", hash = "sha256:c640e3fdccbb6fee1172ce211cefd3c3c04eaf2b0fbf676f0ac7958c41f373e4"},
-    {file = "tzlocal-5.0.tar.gz", hash = "sha256:f96e29a599ef562233cec21ef0d6f7065c3050d0221293e839d1ede093ab1755"},
+    {file = "tzlocal-4.2-py3-none-any.whl", hash = "sha256:89885494684c929d9191c57aa27502afc87a579be5cdd3225c77c463ea043745"},
+    {file = "tzlocal-4.2.tar.gz", hash = "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"},
 ]
 
 [package.dependencies]
+pytz-deprecation-shim = "*"
 tzdata = {version = "*", markers = "platform_system == \"Windows\""}
 
 [package.extras]
-devenv = ["black", "check-manifest", "flake8", "pyroma", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
+devenv = ["black", "pyroma", "pytest-cov", "zest.releaser"]
+test = ["pytest (>=4.3)", "pytest-mock (>=3.3)"]
 
 [[package]]
 name = "unstructured"
@@ -4248,7 +4454,7 @@ wikipedia = ["wikipedia"]
 name = "urllib3"
 version = "1.26.15"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 files = [
@@ -4339,6 +4545,29 @@ files = [
 [package.extras]
 watchmedo = ["PyYAML (>=3.10)"]
 
+[[package]]
+name = "wcwidth"
+version = "0.2.6"
+description = "Measures the displayed width of unicode strings in a terminal"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"},
+    {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"},
+]
+
+[[package]]
+name = "win-unicode-console"
+version = "0.5"
+description = "Enable Unicode input and display when running Python from Windows console."
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "win_unicode_console-0.5.zip", hash = "sha256:d4142d4d56d46f449d6f00536a73625a871cba040f0bc1a2e305a04578f07d1e"},
+]
+
 [[package]]
 name = "wrapt"
 version = "1.14.1"
@@ -4429,7 +4658,7 @@ files = [
 name = "yarl"
 version = "1.9.2"
 description = "Yet another URL library"
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
@@ -4531,5 +4760,5 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 
 [metadata]
 lock-version = "2.0"
-python-versions = ">3.9.7,<3.12"
-content-hash = "bbb551ef93af03b0dbb9b0a454e460ef1ce65182fe66b3bde35e6f89df1760de"
+python-versions = ">=3.10,<3.12"
+content-hash = "b77ebdc2434b6e29302ef1a8b3b9cdefed765ddd745a002c641a8e29f8d56146"
diff --git a/pyproject.toml b/pyproject.toml
index ae5fa22..b6cc2ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,16 +1,17 @@
 [tool.poetry]
 name = "casalioy"
-version = "1.6.0"
+version = "0.1.1"
 description = "Your local langchain toolkit"
 authors = ["su77ungr"]
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = ">3.9.7,<3.12"
+python = ">=3.10,<3.12"
 pdfminer-six = "20221105"
 python-dotenv = "^1.0.0"
-pathlib = "^1.0.1"
 sentencepiece = "^0.1.99"  # For convert.py
+prompt-toolkit = "^3.0"  # Nice terminal output
+libgenesis = "^0.1.9"  # download docs
 
 [tool.poetry.group.GUI.dependencies]  # for the GUI
 streamlit = { git = "https://github.com/hippalectryon-0/streamlit.git", subdirectory = "lib", "rev" = "0b7fb1c" }  # waiting for >1.22.0 so that we can use protobuf>=4
@@ -18,11 +19,14 @@ streamlit-chat = "^0.0.2.2"
 streamlit-extras = "^0.2.7"
 
 [tool.poetry.group.LLM.dependencies]
-langchain = "^0.0.168"
+langchain = "^0.0.171"
 pygpt4all = "^1.1.0"
 qdrant-client = "^1.1.7"
 unstructured = "^0.6.6"  # Handle ingestion file formats
-docx2txt="^0.8"  # Handle docx ingestion file formats
+pypandoc-binary = "^1.11"  # doc conversion
+docx2txt = "^0.8"  # Handle docx ingestion file formats
+tabulate = "^0.9.0"  # Also required for docx
+extract-msg = "^0.41.1"  # Handle email file formats
 llama-cpp-python = "^0.1.50"  # 0.1.50 raises an AssertionError / NameError on <5 vic models
 sentence_transformers = "^2.2.2"  # doesn't install torch properly with poetry, but should be better in later versions
 
diff --git a/startLLM.py b/startLLM.py
deleted file mode 100644
index 47a0d62..0000000
--- a/startLLM.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""start the local LLM"""
-
-import qdrant_client
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.chains import RetrievalQA
-from langchain.vectorstores import Qdrant
-
-from load_env import (
-    chain_type,
-    get_embedding_model,
-    get_prompt_template_kwargs,
-    model_n_ctx,
-    model_path,
-    model_stop,
-    model_temp,
-    model_type,
-    n_gpu_layers,
-    persist_directory,
-    use_mlock,
-)
-
-
-def initialize_qa_system() -> RetrievalQA:
-    """init the LLM"""
-    # Get embeddings and local vector store
-    embeddings = get_embedding_model()[0]
-    client = qdrant_client.QdrantClient(path=persist_directory, prefer_grpc=True)
-    qdrant = Qdrant(client=client, collection_name="test", embeddings=embeddings)
-
-    # Prepare the LLM chain
-    callbacks = [StreamingStdOutCallbackHandler()]
-    match model_type:
-        case "LlamaCpp":
-            from langchain.llms import LlamaCpp
-
-            llm = LlamaCpp(
-                model_path=model_path,
-                n_ctx=model_n_ctx,
-                temperature=model_temp,
-                stop=model_stop,
-                callbacks=callbacks,
-                verbose=True,
-                n_threads=6,
-                n_batch=1000,
-                use_mlock=use_mlock,
-            )
-            # Need this hack because this param isn't yet supported by the python lib
-            state = llm.client.__getstate__()
-            state["n_gpu_layers"] = n_gpu_layers
-            llm.client.__setstate__(state)
-        case "GPT4All":
-            from langchain.llms import GPT4All
-
-            llm = GPT4All(
-                model=model_path,
-                n_ctx=model_n_ctx,
-                callbacks=callbacks,
-                verbose=True,
-                backend="gptj",
-            )
-        case _:
-            raise ValueError("Only LlamaCpp or GPT4All supported right now. Make sure you set up your .env correctly.")
-
-    return RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type=chain_type,
-        retriever=qdrant.as_retriever(search_type="mmr"),
-        return_source_documents=True,
-        chain_type_kwargs=get_prompt_template_kwargs(),
-    )
-
-
-# noinspection PyMissingOrEmptyDocstring
-def main() -> None:
-    qa_system = initialize_qa_system()
-    # Interactive questions and answers
-    while True:
-        query = input("\nEnter a query: ").strip()
-        if query == "exit":
-            break
-        elif not query:  # check if query empty
-            print("Empty query, skipping")
-            continue
-
-        # Get the answer from the chain
-        res = qa_system(query)
-        answer, docs = res["result"], res["source_documents"]
-
-        # Print the result
-        sources_str = "\n\n".join(f">> {document.metadata['source']}:\n{document.page_content}" for document in docs)
-        print(
-            f"""\n\n> Question: {query}
-> Answer: {answer}
-> Sources:\n{sources_str}"""
-        )
-
-
-if __name__ == "__main__":
-    main()