feat(templates): extract text from multimodal requests (#3866)

When offloading template construction to the backend, we want to keep text around in case of multimodal requests. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
mudler · Oct 17, 2024 · d5da8c3 · d5da8c3
1 parent 9db0683
commit d5da8c3
Showing 1 changed file with 11 additions and 0 deletions.
diff --git a/core/backend/llm.go b/core/backend/llm.go
@@ -2,6 +2,7 @@ package backend
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"os"
 	"regexp"
@@ -77,6 +78,16 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			switch ct := message.Content.(type) {
 			case string:
 				protoMessages[i].Content = ct
+			case []interface{}:
+				// When using the tokenizer template with a multimodal request, we want to keep the multimodal content as is and pass along only the text strings here
+				data, _ := json.Marshal(ct)
+				resultData := []struct {
+					Text string `json:"text"`
+				}{}
+				json.Unmarshal(data, &resultData)
+				for _, r := range resultData {
+					protoMessages[i].Content += r.Text
+				}
 			default:
 				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
 			}