update plugins

microsoft · Oct 16, 2023 · 6f23864 · 6f23864
1 parent efa2d4c
commit 6f23864
Show file tree

Hide file tree

Showing 20 changed files with 511 additions and 266 deletions.
diff --git a/copilot/.env → agent/.env b/copilot/.env → agent/.env
diff --git a/copilot/README.md → agent/README.md b/copilot/README.md → agent/README.md
@@ -1,4 +1,4 @@
-<!-- <p align="center"> <b> Music Pilot </b> </p> -->
+<!-- <p align="center"> <b> Music Agent </b> </p> -->
 
 <div align="center">
 
@@ -9,13 +9,16 @@
 
 ## Demo Video
 
-![Download demo video](https://drive.google.com/file/d/1W0iJPHNPA6ENLJrPef0vtQytboSubxXe/view?usp=sharing)
+[![Watch the video](https://img.youtube.com/vi/tpNynjdcBqA/maxresdefault.jpg)](https://youtu.be/tpNynjdcBqA)
 
 ## Features
 
-- Accessibility: Music Pilot dynamically selects the most appropriate methods for each music-related task.
-- Unity: Music Pilot unifies a wide array of tools into a single system, incorporating Huggingface models, GitHub projects, and Web APIs.
-- Modularity: Music Pilot offers high modularity, allowing users to effortlessly enhance its capabilities by integrating new functions.
+- Accessibility: Music Agent dynamically selects the most appropriate methods for each music-related task.
+- Unity: Music Agent unifies a wide array of tools into a single system, incorporating Huggingface models, GitHub projects, and Web APIs.
+- Modularity: Music Agent offers high modularity, allowing users to effortlessly enhance its capabilities by integrating new functions.
+
+## Skills
+
 
 ## Installation
 
@@ -38,19 +41,20 @@ sudo apt-get install -y git-lfs
 sudo apt-get install -y libsndfile1-dev
 sudo apt-get install -y fluidsynth
 sudo apt-get install -y ffmpeg
+sudo apt-get install -y lilypond
 
-# Clone the repository from TODO 
-git clone https://github.com/TODO
-cd DIR
+# Clone the repository from muzic
+git clone https://github.com/microsoft/muzic
+cd muzic/agent
 ```
 
 Next, install the dependent libraries. There might be some conflicts, but they should not affect the functionality of the system.
 
 ```bash
 pip install --upgrade pip
 
-pip install -r requirements.txt
 pip install semantic-kernel
+pip install -r requirements.txt
 pip install numpy==1.23.0
 pip install protobuf==3.20.3
 ```

diff --git a/copilot/agent.py → agent/agent.py b/copilot/agent.py → agent/agent.py
@@ -18,7 +18,7 @@
 from model_utils import lyric_format
 from plugins import get_task_map, init_plugins
 
-class MusicPilotAgent:
+class MusicAgent:
     """
     Attributes:
         config_path: A path to a YAML file, referring to the example config.yaml
@@ -64,7 +64,7 @@ def _init_logger(self):
 
     def _init_semantic_kernel(self):
         skills_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)), "skills")
-        pilot_funcs = self.kernel.import_semantic_skill_from_directory(skills_directory, "MusicPilot")
+        pilot_funcs = self.kernel.import_semantic_skill_from_directory(skills_directory, "MusicAgent")
 
         # task planning
         self.task_planner = pilot_funcs["TaskPlanner"]
@@ -168,6 +168,9 @@ def collect_result(self, command, choose, inference_result):
         return result
 
     def run_task(self, input_text, command, results):
+        if self.error_event.is_set():
+            return
+
         id = command["id"]
         args = command["args"]
         task = command["task"]
@@ -226,7 +229,7 @@ def run_task(self, input_text, command, results):
             inference_result = []
 
             for arg in command["args"]:
-                chat_input = f"[{input_text}] contains a task in JSON format {command}. Now you are a {command['task']} system, the arguments are {arg}. Just help me do {command['task']} and give me the resultwithout any additional description. The result must be in text form without any urls."
+                chat_input = f"[{input_text}] contains a task in JSON format {command}. Now you are a {command['task']} system, the arguments are {arg}. Just help me do {command['task']} and give me the result without any additional description."
                 response = self.skillchat(chat_input, self.chatbot, self.chat_context)
                 inference_result.append({"lyric":lyric_format(response)})
 
@@ -263,7 +266,12 @@ def run_task(self, input_text, command, results):
             inference_result = self.model_inference(best_model_id, command, device=self.config["device"])
 
         results[id] = self.collect_result(command, choose, inference_result)
-        return True
+        for result in inference_result:
+            if "error" in result:
+                self.error_event.set()
+                break
+
+        return
 
     def chat(self, input_text):
         start = time.time()
@@ -277,19 +285,22 @@ def chat(self, input_text):
         except Exception as e:
             self.logger.debug(e)
             response = self.skillchat(input_text, self.chatbot, self.chat_context)
-            return response
+            return response, {"0": "Task parsing error, reply using ChatGPT."}
 
         if len(tasks) == 0:
             response = self.skillchat(input_text, self.chatbot, self.chat_context)
-            return response
+            return response, {"0": "No task detected, reply using ChatGPT."}
 
         tasks = self.fix_depth(tasks)
         results = {}
         threads = []
         d = dict()
         retry = 0
+        self.error_event = threading.Event()
         while True:
             num_thread = len(threads)
+            if self.error_event.is_set():
+                break
             for task in tasks:
                 # logger.debug(f"d.keys(): {d.keys()}, dep: {dep}")
                 for dep_id in task["dep"]:
@@ -326,21 +337,21 @@ def chat(self, input_text):
         end = time.time()
         during = end - start
         self.logger.info(f"time: {during}s")
-        return response
+        return response, results
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="A path to a YAML file")
+    parser = argparse.ArgumentParser(description="music agent config")
     parser.add_argument("--config", type=str, help="a YAML file path.")
 
     args = parser.parse_args()
     return args
 
 if __name__ == "__main__":
     args = parse_args()
-    agent = MusicPilotAgent(args.config, mode="cli")
+    agent = MusicAgent(args.config, mode="cli")
     print("Input exit or quit to stop the agent.")
     while True:
-        message = input("Send a message: ")
+        message = input("User input: ")
         if message in ["exit", "quit"]:
             break
 

diff --git a/copilot/auxiliary/muzic/roc/main.py → agent/auxiliary/muzic/roc/main.py b/copilot/auxiliary/muzic/roc/main.py → agent/auxiliary/muzic/roc/main.py
diff --git a/copilot/config.yaml → agent/config.yaml b/copilot/config.yaml → agent/config.yaml
diff --git a/agent/gradio_agent.py b/agent/gradio_agent.py
@@ -0,0 +1,205 @@
+import uuid
+import os
+import gradio as gr
+import re
+import requests
+from agent import MusicAgent
+import soundfile
+import argparse
+
+
+all_messages = []
+OPENAI_KEY = ""
+
+
+def add_message(content, role):
+    """Record one chat turn in the module-level ``all_messages`` log.
+
+    Args:
+        content: Text of the turn.
+        role: Speaker label stored with the text (callers in this file pass
+            "user" or "assistant").
+    """
+    all_messages.append({"role": role, "content": content})
+
+
+def extract_medias(message):
+    """Find audio and symbolic-music (MIDI) references in a chat message.
+
+    Args:
+        message: Text to scan for file paths or URLs.
+
+    Returns:
+        A pair ``(audio_urls, symbolic_urls)``. Each is a list of the distinct
+        matches ending in ``.flac``/``.wav``/``.mp3`` and ``.mid`` respectively,
+        in order of first appearance in ``message``.
+    """
+    audio_pattern = re.compile(r"(http(s?):|\/)?[a-zA-Z0-9\/.:-]*\.(flac|wav|mp3)")
+    symbolic_pattern = re.compile(r"(http(s?):|\/)?[a-zA-Z0-9\/.:-]*\.(mid)")
+
+    def unique_matches(pattern):
+        # dict.fromkeys de-duplicates while keeping first-seen order, so the
+        # result is deterministic (a set round-trip would scramble it).
+        return list(dict.fromkeys(m.group(0) for m in pattern.finditer(message)))
+
+    return unique_matches(audio_pattern), unique_matches(symbolic_pattern)
+
+
+def set_openai_key(openai_key):
+    """Validate the submitted OpenAI key and, if accepted, configure the agent.
+
+    The key is checked before anything is stored or passed to the agent, so a
+    rejected key never replaces a previously accepted one.
+
+    Args:
+        openai_key: Text submitted from the API-key textbox.
+
+    Returns:
+        A pair ``(textbox_value, window_update)``: on rejection, a hint message
+        and an update hiding the interaction window; on success, the key itself
+        and an update showing the window.
+    """
+    global OPENAI_KEY
+    if not openai_key.startswith("sk-"):
+        return "OpenAI API Key starts with sk-", gr.update(visible=False)
+
+    OPENAI_KEY = openai_key
+    agent._init_backend_from_input(openai_key)
+    return OPENAI_KEY, gr.update(visible=True)
+
+
+def add_text(messages, message):
+    """Append the user's message to the chat and fetch any remote audio it names.
+
+    Args:
+        messages: Current chatbot history (list of ``(user, bot)`` pairs).
+        message: Text the user just submitted.
+
+    Returns:
+        A pair ``(messages, "")``: the extended history and an empty string
+        that clears the input textbox.
+    """
+    add_message(message, "user")
+    messages = messages + [(message, None)]
+    audio_urls, _ = extract_medias(message)
+
+    for audio_url in audio_urls:
+        if audio_url.startswith("http"):
+            ext = audio_url.split(".")[-1]
+            # Convert the UUID to a string BEFORE slicing: UUID objects are not
+            # subscriptable, so uuid.uuid4()[:4] raises TypeError.
+            name = f"{str(uuid.uuid4())[:4]}.{ext}"
+            response = requests.get(audio_url)
+            with open(f"{agent.config['src_fold']}/{name}", "wb") as f:
+                f.write(response.content)
+            messages = messages + [(None, f"{audio_url} is saved as {name}")]
+
+    return messages, ""
+
+
+def upload_audio(file, messages):
+    """Re-encode an uploaded audio file as WAV in the agent's source folder.
+
+    Args:
+        file: Uploaded file object; its ``name`` attribute is read as a path.
+        messages: Current chatbot history.
+
+    Returns:
+        The history extended with a text notice naming the stored file and a
+        one-element tuple entry holding the stored file's path.
+    """
+    stem = str(uuid.uuid4())[:4]
+    samples, rate = soundfile.read(file.name)
+    target = f"{agent.config['src_fold']}/{stem}.wav"
+    soundfile.write(target, samples, samplerate=rate)
+
+    notice = (None, f"Audio is stored in wav format as ** {stem}.wav **")
+    player = (None, (f"{agent.config['src_fold']}/{stem}.wav",))
+    return messages + [notice, player]
+
+
+def bot(messages):
+    """Run the agent on the latest user message and build the chat updates.
+
+    Args:
+        messages: Chatbot history; the last entry's first element is the text
+            sent to ``agent.chat`` and its second element is overwritten with
+            the reply.
+
+    Returns:
+        A pair ``(messages, response)``: the history extended with audio and
+        sheet-music entries found in the reply, and a one-entry history
+        summarising the per-subtask results (strings truncated to 128 chars).
+    """
+    # Local import: the module's import header does not include subprocess.
+    import subprocess
+
+    message, results = agent.chat(messages[-1][0])
+
+    audio_urls, symbolic_urls = extract_medias(message)
+    add_message(message, "assistant")
+    messages[-1][1] = message
+    for audio_url in audio_urls:
+        if not audio_url.startswith("http") and not audio_url.startswith(agent.config['src_fold']):
+            audio_url =  os.path.join(agent.config['src_fold'], audio_url)
+        messages = messages + [(None, f"** {audio_url.split('/')[-1]} **"),
+                                (None, (audio_url,))]
+
+    for symbolic_url in symbolic_urls:
+        if not symbolic_url.startswith(agent.config['src_fold']):
+            symbolic_url = os.path.join(agent.config['src_fold'], symbolic_url)
+
+        # Render the MIDI file to PNG sheet music. Arguments are passed as a
+        # list (no shell), so the path is never interpreted as shell syntax.
+        # Exit codes are ignored, as with the former "cmd1; cmd2" shell line.
+        render_commands = (
+            ["midi2ly", symbolic_url, "-o", f"{symbolic_url}.ly"],
+            ["lilypond", "-f", "png", "-o", symbolic_url, f"{symbolic_url}.ly"],
+        )
+        for render_command in render_commands:
+            try:
+                subprocess.run(render_command)
+            except OSError:
+                # Tool missing or not executable: no image will be produced;
+                # the existence checks below then simply add nothing.
+                break
+        messages = messages + [(None, f"** {symbolic_url.split('/')[-1]} **")]
+
+        if os.path.exists(f"{symbolic_url}.png"):
+            messages = messages + [ (None, (f"{symbolic_url}.png",))]
+        else:
+            # Multi-page scores are looked up as <name>-page1.png, -page2.png, ...
+            s_page = 1
+            while os.path.exists(f"{symbolic_url}-page{s_page}.png"):
+                messages = messages + [ (None, (f"{symbolic_url}-page{s_page}.png",))]
+                s_page += 1
+
+    def truncate_strings(obj, max_length=128):
+        """Recursively shorten every string in nested dicts/lists."""
+        if isinstance(obj, str):
+            if len(obj) > max_length:
+                return obj[:max_length] + "..."
+            return obj
+        if isinstance(obj, dict):
+            return {key: truncate_strings(value, max_length) for key, value in obj.items()}
+        if isinstance(obj, list):
+            return [truncate_strings(item, max_length) for item in obj]
+        return obj
+
+    results = truncate_strings(results)
+    # Result keys are numeric strings; sort them numerically, not lexically.
+    results = sorted(results.items(), key=lambda x: int(x[0]))
+    response = [(None, "\n\n".join([f"Subtask {r[0]}:\n{r[1]}" for r in results]))]
+
+    return messages, response
+
+
+def clear_all_history(messages):
+    """Reset the agent's LLM history and note it in the chat.
+
+    Args:
+        messages: Current chatbot history.
+
+    Returns:
+        The history with a confirmation entry appended.
+    """
+    agent.clear_history()
+    notice = (None, "All LLM history cleared")
+    return messages + [notice]
+
+def parse_args():
+    """Parse command-line options; ``-c/--config`` is a YAML file path."""
+    cli = argparse.ArgumentParser(description="music agent config")
+    cli.add_argument("-c", "--config", type=str, help="a YAML file path.")
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    # Build the agent from the YAML config, then assemble and launch the UI.
+    args = parse_args()
+    agent = MusicAgent(args.config, mode="gradio")
+
+    with gr.Blocks() as demo:
+        # Page header. The badge links to the agent/ directory of the repository.
+        gr.HTML("""
+                <h1 align="center" style=" display: flex; flex-direction: row; justify-content: center; font-size: 25pt; ">🎧 Music Agent</h1>
+                <h3>This is a demo page for Music Agent, a project that uses LLM to integrate music tools. For specific functions, please refer to the examples given below, or refer to the instructions in Github.</h3>
+                <h3>Make sure the uploaded audio resource is in flac|wav|mp3 format.</h3>
+                <h3>Due to RPM limitations, Music Agent requires an OpenAI key for the paid version.</h3>
+                <div style="display: flex;"><a href='https://github.com/microsoft/muzic/tree/main/agent'><img src='https://img.shields.io/badge/Github-Code-blue'></a></div>
+                """)
+
+        with gr.Row():
+            openai_api_key = gr.Textbox(
+                show_label=False,
+                placeholder="Set your OpenAI API key here and press Enter",
+                lines=1,
+                type="password",
+            )
+            state = gr.State([])
+
+        # Hidden until set_openai_key accepts a key and makes it visible.
+        with gr.Row(visible=False) as interact_window:
+
+            with gr.Column(scale=0.7, min_width=500):
+                chatbot = gr.Chatbot([], elem_id="chatbot", label="Music-Agent Chatbot").style(height=500)
+
+                with gr.Tab("User Input"):
+                    with gr.Row(scale=1):
+                        with gr.Column(scale=0.6):
+                            txt = gr.Textbox(show_label=False, placeholder="Press ENTER or click the Run button. You can start by asking 'What can you do?'").style(container=False)
+                        with gr.Column(scale=0.1, min_width=0):
+                            run = gr.Button("🏃‍♂️Run")
+                        with gr.Column(scale=0.1, min_width=0):
+                            clear_txt = gr.Button("🔄Clear️")
+                        with gr.Column(scale=0.2, min_width=0):
+                            btn = gr.UploadButton("☁️Upload Audio", file_types=["audio"])
+
+            with gr.Column(scale=0.3, min_width=300):
+                with gr.Tab("Intermediate Results"):
+                    response = gr.Chatbot([], label="Current Progress").style(height=400)
+
+        # Event wiring.
+        openai_api_key.submit(set_openai_key, [openai_api_key], [openai_api_key, interact_window])
+        clear_txt.click(clear_all_history, [chatbot], [chatbot])
+
+        btn.upload(upload_audio, [btn, chatbot], [chatbot])
+        # Both the Run button and pressing Enter first echo the user's text
+        # (add_text) and then run the agent (bot).
+        run.click(add_text, [chatbot, txt], [chatbot, txt]).then(
+            bot, chatbot, [chatbot, response]
+        )
+        txt.submit(add_text, [chatbot, txt], [chatbot, txt]).then(
+            bot, chatbot, [chatbot, response]
+        )
+
+        gr.Examples(
+            examples=["What can you do?",
+                        "Write a piece of lyric about the recent World Cup.",
+                        "生成一首古风歌词的中文歌",
+                        "Download a song by Jay Chou for me and separate the vocals and the accompanies.",
+                        "Convert the vocals in /b.wav to a violin sound.",
+                        "Give me the sheet music and lyrics in the song /a.wav",
+                        "近一个月流行的音乐类型",
+                        "把c.wav中的人声搭配合适的旋律变成一首歌"
+                        ],
+            inputs=txt
+        )
+
+    demo.launch(share=True)
diff --git a/copilot/model_utils.py → agent/model_utils.py b/copilot/model_utils.py → agent/model_utils.py
diff --git a/copilot/models/download.sh → agent/models/download.sh b/copilot/models/download.sh → agent/models/download.sh
@@ -3,8 +3,11 @@
 # Set models to download
 models=(
     "m3hrdadfi/wav2vec2-base-100k-gtzan-music-genres"
+	"lewtun/distilhubert-finetuned-music-genres"
+	"dima806/music_genres_classification"
     "sander-wood/text-to-music"
     "jonatasgrosman/whisper-large-zh-cv11"
+	"cvssp/audioldm-m-full"
 )
 
 # Set the current directory