Merge commit '6e0fb055d18969eb923e719ad92ecac3a5c5d534'
* commit '6e0fb055d18969eb923e719ad92ecac3a5c5d534': (42 commits)
  chore: bump version to 0.15.1 (langgenius#12690)
  feat: add table of contents to Knowledge API doc (langgenius#12688)
  [fix] support feature restore (langgenius#12563)
  api tool support multiple env url (langgenius#12249)
  Add new integration with Opik Tracking tool (langgenius#11501)
  fix: add type hints for App model and improve error handling in audio services (langgenius#12677)
  fix: Update variable handling in VariableAssignerNode and clean up app_dsl_service (langgenius#12672)
  Revert "Feat/new saas billing" (langgenius#12673)
  fix(workflow): fix answer node stream processing in conditional branches (langgenius#12510)
  fix: ruff with statements (langgenius#12578)
  fix: ruff check for True if ... else (langgenius#12576)
  chore: Adjust translations to align with Taiwanese Mandarin conventions (langgenius#12633)
  Fix pandas indexing method for knowledge base imports (langgenius#12637) (langgenius#12638)
  Feat/new saas billing (langgenius#12591)
  improve the readability of the function generate_api_key (langgenius#12552)
  chore: translate i18n files (langgenius#12543)
  Feat/add knowledge include all filter (langgenius#12537)
  fix: Add datasets list access control and fix datasets config display issue (langgenius#12533)
  fix: sum costs return error value on overview page (langgenius#12534)
  feat: show workflow running status (langgenius#12531)
  ...

# Conflicts:
#	api/poetry.lock
Scorpion1221 committed Jan 14, 2025
2 parents 9a4ecc0 + 6e0fb05 commit 1597a07
Showing 147 changed files with 2,930 additions and 744 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/style.yml
@@ -82,6 +82,33 @@ jobs:
if: steps.changed-files.outputs.any_changed == 'true'
run: yarn run lint

docker-compose-template:
name: Docker Compose Template
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Check changed files
id: changed-files
uses: tj-actions/changed-files@v45
with:
files: |
docker/generate_docker_compose
docker/.env.example
docker/docker-compose-template.yaml
docker/docker-compose.yaml
- name: Generate Docker Compose
if: steps.changed-files.outputs.any_changed == 'true'
run: |
cd docker
./generate_docker_compose
- name: Check for changes
if: steps.changed-files.outputs.any_changed == 'true'
run: git diff --exit-code

superlinter:
name: SuperLinter
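The new docker-compose-template job regenerates docker/docker-compose.yaml whenever the generator script, the template, or .env.example changes, and fails the build if the committed file has drifted from the generated output. A minimal sketch of the same drift check as a local Python script (the generate_docker_compose path comes from the workflow above; the rest is illustrative):

import subprocess
import sys


def check_compose_drift(docker_dir: str = "docker") -> int:
    # Regenerate docker-compose.yaml from the template and .env.example.
    subprocess.run(["./generate_docker_compose"], cwd=docker_dir, check=True)
    # `git diff --exit-code` exits non-zero when the regenerated file
    # differs from the committed one, mirroring the CI failure.
    return subprocess.run(["git", "diff", "--exit-code"]).returncode


if __name__ == "__main__":
    sys.exit(check_compose_drift())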
6 changes: 6 additions & 0 deletions api/configs/middleware/vdb/milvus_config.py
@@ -33,3 +33,9 @@ class MilvusConfig(BaseSettings):
description="Name of the Milvus database to connect to (default is 'default')",
default="default",
)

MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
description="Enable hybrid search features (requires Milvus >= 2.5.0). Set to false for compatibility with "
"older versions",
default=True,
)
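The new flag follows the existing pydantic-settings pattern: it is populated from the MILVUS_ENABLE_HYBRID_SEARCH environment variable and defaults to True, so only deployments running Milvus older than 2.5.0 need to set it to false. A standalone sketch of how such a field resolves (a demo class, not Dify's actual config):

import os

from pydantic import Field
from pydantic_settings import BaseSettings


class MilvusDemoConfig(BaseSettings):
    MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
        description="Enable hybrid search features (requires Milvus >= 2.5.0)",
        default=True,
    )


os.environ["MILVUS_ENABLE_HYBRID_SEARCH"] = "false"
print(MilvusDemoConfig().MILVUS_ENABLE_HYBRID_SEARCH)  # False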
2 changes: 1 addition & 1 deletion api/configs/packaging/__init__.py
@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):

CURRENT_VERSION: str = Field(
description="Dify version",
default="0.15.0",
default="0.15.1",
)

COMMIT_SHA: str = Field(
8 changes: 6 additions & 2 deletions api/controllers/console/app/audio.py
@@ -22,7 +22,7 @@
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError
from libs.login import login_required
from models.model import AppMode
from models import App, AppMode
from services.audio_service import AudioService
from services.errors.audio import (
AudioTooLargeServiceError,
@@ -79,7 +79,7 @@ class ChatMessageTextApi(Resource):
@login_required
@account_initialization_required
@get_app_model
def post(self, app_model):
def post(self, app_model: App):
from werkzeug.exceptions import InternalServerError

try:
@@ -98,9 +98,13 @@ def post(self, app_model):
and app_model.workflow.features_dict
):
text_to_speech = app_model.workflow.features_dict.get("text_to_speech")
if text_to_speech is None:
raise ValueError("TTS is not enabled")
voice = args.get("voice") or text_to_speech.get("voice")
else:
try:
if app_model.app_model_config is None:
raise ValueError("AppModelConfig not found")
voice = args.get("voice") or app_model.app_model_config.text_to_speech_dict.get("voice")
except Exception:
voice = None
6 changes: 4 additions & 2 deletions api/controllers/console/datasets/datasets.py
@@ -52,12 +52,12 @@ def get(self):
# provider = request.args.get("provider", default="vendor")
search = request.args.get("keyword", default=None, type=str)
tag_ids = request.args.getlist("tag_ids")

include_all = request.args.get("include_all", default="false").lower() == "true"
if ids:
datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
else:
datasets, total = DatasetService.get_datasets(
page, limit, current_user.current_tenant_id, current_user, search, tag_ids
page, limit, current_user.current_tenant_id, current_user, search, tag_ids, include_all
)

# check embedding setting
@@ -640,6 +640,7 @@ def get(self):
| VectorType.MYSCALE
| VectorType.ORACLE
| VectorType.ELASTICSEARCH
| VectorType.ELASTICSEARCH_JA
| VectorType.PGVECTOR
| VectorType.TIDB_ON_QDRANT
| VectorType.LINDORM
@@ -683,6 +684,7 @@ def get(self, vector_type):
| VectorType.MYSCALE
| VectorType.ORACLE
| VectorType.ELASTICSEARCH
| VectorType.ELASTICSEARCH_JA
| VectorType.COUCHBASE
| VectorType.PGVECTOR
| VectorType.LINDORM
3 changes: 2 additions & 1 deletion api/controllers/console/datasets/datasets_document.py
@@ -257,7 +257,8 @@ def post(self, dataset_id):
parser.add_argument("original_document_id", type=str, required=False, location="json")
parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")

parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
parser.add_argument(
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
4 changes: 2 additions & 2 deletions api/controllers/console/datasets/datasets_segments.py
@@ -368,9 +368,9 @@ def post(self, dataset_id, document_id):
result = []
for index, row in df.iterrows():
if document.doc_form == "qa_model":
data = {"content": row[0], "answer": row[1]}
data = {"content": row.iloc[0], "answer": row.iloc[1]}
else:
data = {"content": row[0]}
data = {"content": row.iloc[0]}
result.append(data)
if len(result) == 0:
raise ValueError("The CSV file is empty.")
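The switch to iloc matters because the Series produced by iterrows() is labeled by column name: row[0] only worked through pandas' integer-key positional fallback, which is deprecated in recent releases and now raises for non-integer indexes. row.iloc[0] addresses the first column by position explicitly. A small illustration with hypothetical data:

import pandas as pd

df = pd.DataFrame([["What is Dify?", "An LLM app platform."]], columns=["content", "answer"])
for _, row in df.iterrows():
    # row[0] would require a column literally named 0 (or the deprecated
    # fallback); iloc selects by position regardless of column labels.
    data = {"content": row.iloc[0], "answer": row.iloc[1]}
    print(data)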
2 changes: 1 addition & 1 deletion api/controllers/console/explore/conversation.py
@@ -32,7 +32,7 @@ def get(self, installed_app):

pinned = None
if "pinned" in args and args["pinned"] is not None:
pinned = True if args["pinned"] == "true" else False
pinned = args["pinned"] == "true"

try:
with Session(db.engine) as session:
2 changes: 1 addition & 1 deletion api/controllers/service_api/__init__.py
@@ -7,4 +7,4 @@

from . import index
from .app import app, audio, completion, conversation, file, message, workflow
from .dataset import dataset, document, hit_testing, segment
from .dataset import dataset, document, hit_testing, segment, upload_file
5 changes: 4 additions & 1 deletion api/controllers/service_api/dataset/dataset.py
@@ -31,8 +31,11 @@ def get(self, tenant_id):
# provider = request.args.get("provider", default="vendor")
search = request.args.get("keyword", default=None, type=str)
tag_ids = request.args.getlist("tag_ids")
include_all = request.args.get("include_all", default="false").lower() == "true"

datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
datasets, total = DatasetService.get_datasets(
page, limit, tenant_id, current_user, search, tag_ids, include_all
)
# check embedding setting
provider_manager = ProviderManager()
configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)
54 changes: 54 additions & 0 deletions api/controllers/service_api/dataset/upload_file.py
@@ -0,0 +1,54 @@
from werkzeug.exceptions import NotFound

from controllers.service_api import api
from controllers.service_api.wraps import (
DatasetApiResource,
)
from core.file import helpers as file_helpers
from extensions.ext_database import db
from models.dataset import Dataset
from models.model import UploadFile
from services.dataset_service import DocumentService


class UploadFileApi(DatasetApiResource):
def get(self, tenant_id, dataset_id, document_id):
"""Get upload file."""
# check dataset
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
if not dataset:
raise NotFound("Dataset not found.")
# check document
document_id = str(document_id)
document = DocumentService.get_document(dataset.id, document_id)
if not document:
raise NotFound("Document not found.")
# check upload file
if document.data_source_type != "upload_file":
raise ValueError(f"Document data source type ({document.data_source_type}) is not upload_file.")
data_source_info = document.data_source_info_dict
if data_source_info and "upload_file_id" in data_source_info:
file_id = data_source_info["upload_file_id"]
upload_file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
if not upload_file:
raise NotFound("UploadFile not found.")
else:
raise ValueError("Upload file id not found in document data source info.")

url = file_helpers.get_signed_file_url(upload_file_id=upload_file.id)
return {
"id": upload_file.id,
"name": upload_file.name,
"size": upload_file.size,
"extension": upload_file.extension,
"url": url,
"download_url": f"{url}&as_attachment=true",
"mime_type": upload_file.mime_type,
"created_by": upload_file.created_by,
"created_at": upload_file.created_at.timestamp(),
}, 200


api.add_resource(UploadFileApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/upload-file")
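A hypothetical client call against the new endpoint (the base URL, IDs, and API key are placeholders; the route and response fields come from the resource above, assuming the service API's usual /v1 prefix):

import requests

resp = requests.get(
    "https://api.dify.example/v1/datasets/<dataset_id>/documents/<document_id>/upload-file",
    headers={"Authorization": "Bearer <dataset-api-key>"},
)
resp.raise_for_status()
info = resp.json()
# "url" is a signed preview link; "download_url" appends &as_attachment=true.
print(info["name"], info["size"], info["download_url"])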
2 changes: 1 addition & 1 deletion api/controllers/service_api/wraps.py
@@ -236,7 +236,7 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]
tenant_id=app_model.tenant_id,
app_id=app_model.id,
type="service_api",
is_anonymous=True if user_id == "DEFAULT-USER" else False,
is_anonymous=user_id == "DEFAULT-USER",
session_id=user_id,
)
db.session.add(end_user)
2 changes: 1 addition & 1 deletion api/controllers/web/conversation.py
@@ -39,7 +39,7 @@ def get(self, app_model, end_user):

pinned = None
if "pinned" in args and args["pinned"] is not None:
pinned = True if args["pinned"] == "true" else False
pinned = args["pinned"] == "true"

try:
with Session(db.engine) as session:
Expand Down
18 changes: 14 additions & 4 deletions api/core/indexing_runner.py
@@ -530,7 +530,6 @@ def _load(
# chunk nodes by chunk size
indexing_start_at = time.perf_counter()
tokens = 0
chunk_size = 10
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
# create keyword index
create_keyword_thread = threading.Thread(
@@ -539,11 +538,22 @@ def _load(
)
create_keyword_thread.start()

max_workers = 10
if dataset.indexing_technique == "high_quality":
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for i in range(0, len(documents), chunk_size):
chunk_documents = documents[i : i + chunk_size]

# Distribute documents into multiple groups based on the hash values of page_content
# This is done to prevent multiple threads from processing the same document,
# thereby avoiding potential database insertion deadlocks
document_groups: list[list[Document]] = [[] for _ in range(max_workers)]
for document in documents:
hash = helper.generate_text_hash(document.page_content)
group_index = int(hash, 16) % max_workers
document_groups[group_index].append(document)
for chunk_documents in document_groups:
if len(chunk_documents) == 0:
continue
futures.append(
executor.submit(
self._process_chunk,
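Sharding by content hash guarantees that identical page_content always lands in the same group, so no two worker threads ever insert the same segment concurrently. A self-contained sketch of the grouping step (sha256 stands in here for Dify's helper.generate_text_hash):

import hashlib


def group_by_content_hash(contents: list[str], max_workers: int = 10) -> list[list[str]]:
    # Each document's content hash deterministically selects one group,
    # so duplicate content never spans two workers.
    groups: list[list[str]] = [[] for _ in range(max_workers)]
    for content in contents:
        digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
        groups[int(digest, 16) % max_workers].append(content)
    return groups


print([len(g) for g in group_by_content_hash([f"chunk {i}" for i in range(25)])])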
@@ -1,7 +1,8 @@
import logging
from threading import Lock
from typing import Any

import tiktoken
logger = logging.getLogger(__name__)

_tokenizer: Any = None
_lock = Lock()
@@ -33,9 +34,18 @@ def get_encoder() -> Any:
if _tokenizer is None:
# Try to use tiktoken to get the tokenizer because it is faster
#
_tokenizer = tiktoken.get_encoding("gpt2")
# base_path = abspath(__file__)
# gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
# _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
try:
import tiktoken

_tokenizer = tiktoken.get_encoding("gpt2")
except Exception:
from os.path import abspath, dirname, join

from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer # type: ignore

base_path = abspath(__file__)
gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
_tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
logger.info("Fallback to Transformers' GPT-2 tokenizer from tiktoken")

return _tokenizer
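Both branches return an object with a GPT-2-compatible encode method, so token counting works the same regardless of which library loaded. A condensed sketch of the lazy fallback pattern (module and model names are illustrative; the real code loads the Transformers tokenizer from a vendored local path rather than the Hugging Face hub):

from threading import Lock
from typing import Any

_tokenizer: Any = None
_lock = Lock()


def get_encoder() -> Any:
    global _tokenizer
    with _lock:
        if _tokenizer is None:
            try:
                import tiktoken  # fast path when available

                _tokenizer = tiktoken.get_encoding("gpt2")
            except Exception:
                # Slower pure-Python fallback.
                from transformers import GPT2Tokenizer

                _tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        return _tokenizer


print(len(get_encoder().encode("hello world")))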
@@ -377,10 +377,7 @@ def _generate(
for tool in tools:
formatted_tools.append(helper.dump_model(PromptMessageFunction(function=tool)))

if prompt_messages[-1].role.value == "tool":
data["tools"] = None
else:
data["tools"] = formatted_tools
data["tools"] = formatted_tools

if stop:
data["stop"] = stop
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
- document
model_properties:
mode: chat
context_size: 200000
@@ -87,6 +87,6 @@ def _validate_credential_form_schema(
if value.lower() not in {"true", "false"}:
raise ValueError(f"Variable {credential_form_schema.variable} should be true or false")

value = True if value.lower() == "true" else False
value = value.lower() == "true"

return value
32 changes: 32 additions & 0 deletions api/core/ops/entities/config_entity.py
@@ -6,6 +6,7 @@
class TracingProviderEnum(Enum):
LANGFUSE = "langfuse"
LANGSMITH = "langsmith"
OPIK = "opik"


class BaseTracingConfig(BaseModel):
@@ -56,5 +57,36 @@ def set_value(cls, v, info: ValidationInfo):
return v


class OpikConfig(BaseTracingConfig):
"""
Model class for Opik tracing config.
"""

api_key: str | None = None
project: str | None = None
workspace: str | None = None
url: str = "https://www.comet.com/opik/api/"

@field_validator("project")
@classmethod
def project_validator(cls, v, info: ValidationInfo):
if v is None or v == "":
v = "Default Project"

return v

@field_validator("url")
@classmethod
def url_validator(cls, v, info: ValidationInfo):
if v is None or v == "":
v = "https://www.comet.com/opik/api/"
if not v.startswith(("https://", "http://")):
raise ValueError("url must start with https:// or http://")
if not v.endswith("/api/"):
raise ValueError("url should ends with /api/")

return v


OPS_FILE_PATH = "ops_trace/"
OPS_TRACE_FAILED_KEY = "FAILED_OPS_TRACE"
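The validators above normalize empty values to defaults and reject malformed endpoints; pydantic surfaces the failed url check as a ValidationError, which subclasses ValueError. An illustrative use:

cfg = OpikConfig(api_key="sk-placeholder", project="", workspace="team")
print(cfg.project)  # "Default Project"
print(cfg.url)      # "https://www.comet.com/opik/api/"

try:
    OpikConfig(url="https://example.com/opik")
except ValueError as exc:
    print(exc)  # the url must end with /api/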
