SQL-migration-assistant (#238)
robertwhiffin authored Sep 6, 2024
1 parent adde807 commit 62947b6
Showing 47 changed files with 2,434 additions and 2 deletions.
1 change: 1 addition & 0 deletions CODEOWNERS
@@ -13,6 +13,7 @@ go-libs @nfx
ip_access_list_analyzer @alexott
metascan @nfx
runtime-packages @nfx
sql_migration_copilot @robertwhiffin
tacklebox @Jonathan-Choi
uc-catalog-cloning @esiol-db @vasco-lopes
.github @nfx
6 changes: 5 additions & 1 deletion cli.py
@@ -10,9 +10,13 @@ def ip_access_list_analyzer(**args):
import ip_access_list_analyzer.ip_acl_analyzer as analyzer
analyzer.main(args)

def sql_migration_assistant(**args):
from sql_migration_assistant import hello
hello()

MAPPING = {
"ip-access-list-analyzer": ip_access_list_analyzer
"ip-access-list-analyzer": ip_access_list_analyzer,
"sql-migration-assistant": sql_migration_assistant
}


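The `MAPPING` dict above is how `cli.py` routes hyphenated subcommand names to handler functions. A minimal, self-contained sketch of that dispatch pattern, using stand-in handlers rather than the real analyzer/assistant imports:

```python
# Stand-in handlers; the real cli.py imports ip_access_list_analyzer and
# sql_migration_assistant instead.
def ip_access_list_analyzer(**args):
    return "analyzer called"

def sql_migration_assistant(**args):
    return "assistant called"

MAPPING = {
    "ip-access-list-analyzer": ip_access_list_analyzer,
    "sql-migration-assistant": sql_migration_assistant,
}

def dispatch(command: str, **args):
    # Look up the handler by its hyphenated command name and invoke it.
    handler = MAPPING.get(command)
    if handler is None:
        raise ValueError(f"unknown command: {command}")
    return handler(**args)
```

Adding a new labs command is then a matter of adding one function and one `MAPPING` entry, which is exactly what this commit does.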
10 changes: 9 additions & 1 deletion labs.yml
@@ -1,7 +1,6 @@
---
name: sandbox
install:
min_runtime_version: 13.1
script: install.py
description: Databricks Labs Sandbox
entrypoint: cli.py
@@ -17,3 +16,12 @@ commands:
- name: apply
description: "If script should do the changes"
default: false
- name: sql-migration-assistant
description: "GenAI enabled SQL Migration tool"
flags:
- name: json_file
description: "Optional JSON file with dump of IP Access Lists"
default: ''
- name: apply
description: "If script should do the changes"
default: false
1 change: 1 addition & 0 deletions sql_migration_assistant/.gitignore
@@ -0,0 +1 @@
.databrickscfg
68 changes: 68 additions & 0 deletions sql_migration_assistant/README.md
@@ -0,0 +1,68 @@
---
title: Project Legion - SQL Migration Assistant
language: python
author: Robert Whiffin
date: 2024-08-28

tags:
- SQL
- Migration
- copilot
- GenAI

---

# Project Legion - SQL Migration Assistant

Legion is a Databricks field project to accelerate migrations onto Databricks by leveraging the platform's generative AI
capabilities. It uses an LLM for code conversion and intent summarisation, presented to users in a front-end web
application.

Legion provides a chatbot interface to users for translating input code (for example T-SQL to Databricks SQL) and
summarising the intent and business purpose of the code. The intent is then embedded and served in a Vector Search
index for finding similar pieces of code. This creates opportunities for collaboration (find out who is
working on similar projects), rationalisation (identify duplicates based on intent) and discoverability (semantic search).

Legion is a solution accelerator - it is *not* a fully baked solution. It is something for you, the customer, to take
on and own. This lets you run a project to upskill your employees, leverage GenAI for a real use case,
customise the application to your needs, and entirely own the IP.

## Installation Videos

https://github.com/user-attachments/assets/b43372fb-95ea-49cd-9a2c-aec8e0d6700f

https://github.com/user-attachments/assets/fa622f96-a78c-40b8-9eb9-f6671c4d7b47

https://github.com/user-attachments/assets/1a58a1b5-2dcf-4624-b93f-214735162584



Setting Legion up is a simple and automated process. Ensure you have the
[Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) installed and configured with the correct
workspace. Install the [Databricks Labs Sandbox](https://github.com/databrickslabs/sandbox).

First, navigate to where you have installed the Databricks Labs Sandbox. For example:
```bash
cd /Documents/sandbox
```

You'll need to install the python requirements in the `requirements.txt` file in the root of the project.
You may wish to do this in a virtual environment.
```bash
pip install -r sql-migration-assistant/requirements.txt -q
```
Run the following command to start the installation process, creating all the necessary resources in your workspace.
```bash
databricks labs sandbox sql-migration-assistant
```

### What Legion needs

During setup, you will create or choose existing resources for the following:

- A no-isolation shared cluster running the ML runtime (tested on DBR 15.0 ML) to host the front end application.
- A catalog and schema in Unity Catalog.
- A table to store the code intent statements and their embeddings.
- A vector search endpoint and an embedding model: see the
  [vector search docs](https://docs.databricks.com/en/generative-ai/vector-search.html#how-to-set-up-vector-search).
- A chat LLM. Pay Per Token is recommended where available, but the setup will also allow for creation of
  a provisioned throughput endpoint.
- A PAT stored in a secret scope chosen by you, under the key `sql-migration-pat`.
15 changes: 15 additions & 0 deletions sql_migration_assistant/__init__.py
@@ -0,0 +1,15 @@
from sql_migration_assistant.utils.initialsetup import SetUpMigrationAssistant
from databricks.sdk import WorkspaceClient
from databricks.labs.blueprint.tui import Prompts
import yaml


def hello():
w = WorkspaceClient(product="sql_migration_assistant", product_version="0.0.1")
p = Prompts()
setter_upper = SetUpMigrationAssistant()
final_config = setter_upper.setup_migration_assistant(w, p)
with open("sql_migration_assistant/config.yml", "w") as f:
yaml.dump(final_config, f)
setter_upper.upload_files(w)
setter_upper.launch_review_app(w, final_config)
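`hello()` persists the collected settings by dumping `final_config` to `config.yml`, which the rest of the assistant reads back. A minimal sketch of that YAML round-trip, with illustrative keys (not the real config schema), assuming PyYAML is installed:

```python
import yaml

# Illustrative settings; the real final_config comes from SetUpMigrationAssistant.
final_config = {"catalog": "main", "schema": "sql_migration", "endpoint": "vs_endpoint"}

# Serialise to YAML text (hello() writes this to sql_migration_assistant/config.yml).
text = yaml.dump(final_config)

# Reading it back with safe_load recovers the original dict.
loaded = yaml.safe_load(text)
```

Note that `hello()` opens the config path relative to the current working directory, so it expects to be run from the sandbox root.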
Empty file.
84 changes: 84 additions & 0 deletions sql_migration_assistant/app/llm.py
@@ -0,0 +1,84 @@
import logging

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import ChatMessage, ChatMessageRole

# Default endpoint settings; LLMCalls instances carry their own copies.
foundation_llm_name = "databricks-meta-llama-3-1-405b-instruct"
max_token = 4096


class LLMCalls:
def __init__(self, foundation_llm_name, max_tokens):
self.w = WorkspaceClient()
self.foundation_llm_name = foundation_llm_name
self.max_tokens = int(max_tokens)

def call_llm(self, messages):
"""
Function to call the LLM model and return the response.
:param messages: list of messages like
messages=[
ChatMessage(role=ChatMessageRole.SYSTEM, content="You are an unhelpful assistant"),
ChatMessage(role=ChatMessageRole.USER, content="What is RAG?"),
ChatMessage(role=ChatMessageRole.ASSISTANT, content="A type of cloth?")
]
:return: the response from the model
"""
response = self.w.serving_endpoints.query(
    name=self.foundation_llm_name, max_tokens=self.max_tokens, messages=messages
)
message = response.choices[0].message.content
return message

def convert_chat_to_llm_input(self, system_prompt, chat):
# Convert the chat list of lists to the required format for the LLM
messages = [ChatMessage(role=ChatMessageRole.SYSTEM, content=system_prompt)]
for q, a in chat:
messages.extend(
[
ChatMessage(role=ChatMessageRole.USER, content=q),
ChatMessage(role=ChatMessageRole.ASSISTANT, content=a),
]
)
return messages

################################################################################
# FUNCTION FOR TRANSLATING CODE
################################################################################

# Builds the prompt messages and delegates to call_llm to query the endpoint.

def llm_translate(self, system_prompt, input_code):
messages = [
ChatMessage(role=ChatMessageRole.SYSTEM, content=system_prompt),
ChatMessage(role=ChatMessageRole.USER, content=input_code),
]

# call the LLM end point.
llm_answer = self.call_llm(messages=messages)
# The LLM often wraps the code in triple backticks with an 'sql' language tag;
# stripping them can be added here if needed (not currently implemented).
translation = llm_answer
return translation

def llm_chat(self, system_prompt, query, chat_history):
messages = self.convert_chat_to_llm_input(system_prompt, chat_history)
messages.append(ChatMessage(role=ChatMessageRole.USER, content=query))
# call the LLM end point.
llm_answer = self.call_llm(messages=messages)
return llm_answer

def llm_intent(self, system_prompt, input_code):
messages = [
ChatMessage(role=ChatMessageRole.SYSTEM, content=system_prompt),
ChatMessage(role=ChatMessageRole.USER, content=input_code),
]

# call the LLM end point.
llm_answer = self.call_llm(messages=messages)
return llm_answer
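`convert_chat_to_llm_input` flattens a chat history of `(question, answer)` pairs into an alternating user/assistant message list behind a system prompt. A sketch of that conversion that runs without a workspace, using a stand-in dataclass in place of the SDK's `ChatMessage`:

```python
from dataclasses import dataclass

@dataclass
class Msg:
    # Stand-in for databricks.sdk.service.serving.ChatMessage.
    role: str
    content: str

def convert_chat_to_llm_input(system_prompt, chat):
    # chat is a list of (user_question, assistant_answer) pairs.
    messages = [Msg("system", system_prompt)]
    for q, a in chat:
        messages.extend([Msg("user", q), Msg("assistant", a)])
    return messages
```

`llm_chat` then appends the newest user query to this list before calling the endpoint, so the model always sees the full alternating history.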
40 changes: 40 additions & 0 deletions sql_migration_assistant/app/similar_code.py
@@ -0,0 +1,40 @@
from databricks.sdk import WorkspaceClient
from databricks.labs.lsql.core import StatementExecutionExt


class SimilarCode:

def __init__(
self,
workspace_client: WorkspaceClient,
see: StatementExecutionExt,
catalog,
schema,
code_intent_table_name,
VS_index_name,
VS_endpoint_name,
):
self.w = workspace_client
self.see = see
self.catalog = catalog
self.schema = schema
self.code_intent_table_name = code_intent_table_name
self.vs_index_name = VS_index_name
self.vs_endpoint_name = VS_endpoint_name

def save_intent(self, code, intent):
    # Note: Python's built-in hash() of a string is salted per process
    # (PYTHONHASHSEED), so this id is not stable across runs.
    code_hash = hash(code)
    _ = self.see.execute(
        f'INSERT INTO {self.catalog}.{self.schema}.{self.code_intent_table_name} VALUES ({code_hash}, "{code}", "{intent}")',
    )

def get_similar_code(self, chat_history):
intent = chat_history[-1][1]
results = self.w.vector_search_indexes.query_index(
index_name=f"{self.catalog}.{self.schema}.{self.vs_index_name}",
columns=["code", "intent"],
query_text=intent,
num_results=1,
)
docs = results.result.data_array
return (docs[0][0], docs[0][1])
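`save_intent` keys rows on Python's built-in `hash()`, which for strings is salted per process (`PYTHONHASHSEED`), so the same code produces different ids across runs. A stable alternative is a content digest; a sketch under that assumption:

```python
import hashlib

def stable_code_hash(code: str) -> int:
    # First 16 hex digits of SHA-256, taken as a 64-bit-sized integer id.
    # Unlike hash(), this is deterministic across processes and machines.
    return int(hashlib.sha256(code.encode("utf-8")).hexdigest()[:16], 16)
```

A deterministic id makes re-ingesting the same code idempotent (duplicates can be detected by key), which the salted built-in `hash()` cannot guarantee.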
Binary file not shown.
20 changes: 20 additions & 0 deletions sql_migration_assistant/docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
