Merge commit '6e0fb055d18969eb923e719ad92ecac3a5c5d534'
* commit '6e0fb055d18969eb923e719ad92ecac3a5c5d534': (42 commits)
  chore: bump version to 0.15.1 (langgenius#12690)
  feat: add table of contents to Knowledge API doc (langgenius#12688)
  [fix] support feature restore (langgenius#12563)
  api tool support multiple env url (langgenius#12249)
  Add new integration with Opik Tracking tool (langgenius#11501)
  fix: add type hints for App model and improve error handling in audio services (langgenius#12677)
  fix: Update variable handling in VariableAssignerNode and clean up app_dsl_service (langgenius#12672)
  Revert "Feat/new saas billing" (langgenius#12673)
  fix(workflow): fix answer node stream processing in conditional branches (langgenius#12510)
  fix: ruff with statements (langgenius#12578)
  fix: ruff check for True if ... else (langgenius#12576)
  chore: Adjust translations to align with Taiwanese Mandarin conventions (langgenius#12633)
  Fix pandas indexing method for knowledge base imports (langgenius#12637) (langgenius#12638)
  Feat/new saas billing (langgenius#12591)
  improve the readability of the function generate_api_key (langgenius#12552)
  chore: translate i18n files (langgenius#12543)
  Feat/add knowledge include all filter (langgenius#12537)
  fix: Add datasets list access control and fix datasets config display issue (langgenius#12533)
  fix: sum costs return error value on overview page (langgenius#12534)
  feat: show workflow running status (langgenius#12531)
  ...

# Conflicts:
#	api/poetry.lock
Scorpion1221 committed Jan 14, 2025
2 parents 9a4ecc0 + 6e0fb05 commit 1597a07
Showing 147 changed files with 2,930 additions and 744 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/style.yml
@@ -82,6 +82,33 @@ jobs:
if: steps.changed-files.outputs.any_changed == 'true'
run: yarn run lint

docker-compose-template:
name: Docker Compose Template
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Check changed files
id: changed-files
uses: tj-actions/changed-files@v45
with:
files: |
docker/generate_docker_compose
docker/.env.example
docker/docker-compose-template.yaml
docker/docker-compose.yaml
- name: Generate Docker Compose
if: steps.changed-files.outputs.any_changed == 'true'
run: |
cd docker
./generate_docker_compose
- name: Check for changes
if: steps.changed-files.outputs.any_changed == 'true'
run: git diff --exit-code

superlinter:
name: SuperLinter
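The new docker-compose-template job regenerates docker/docker-compose.yaml whenever the generator script, the template, or .env.example changes, and fails the build if the committed file has drifted from the generated output. A minimal sketch of the same drift check as a local Python script (the generate_docker_compose path comes from the workflow above; the rest is illustrative):

import subprocess
import sys


def check_compose_drift(docker_dir: str = "docker") -> int:
    # Regenerate docker-compose.yaml from the template and .env.example.
    subprocess.run(["./generate_docker_compose"], cwd=docker_dir, check=True)
    # `git diff --exit-code` exits non-zero when the regenerated file
    # differs from the committed one, mirroring the CI failure.
    return subprocess.run(["git", "diff", "--exit-code"]).returncode


if __name__ == "__main__":
    sys.exit(check_compose_drift())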
6 changes: 6 additions & 0 deletions api/configs/middleware/vdb/milvus_config.py
@@ -33,3 +33,9 @@ class MilvusConfig(BaseSettings):
description="Name of the Milvus database to connect to (default is 'default')",
default="default",
)

MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
description="Enable hybrid search features (requires Milvus >= 2.5.0). Set to false for compatibility with "
"older versions",
default=True,
)
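The new flag follows the existing pydantic-settings pattern: it is populated from the MILVUS_ENABLE_HYBRID_SEARCH environment variable and defaults to True, so only deployments running Milvus older than 2.5.0 need to set it to false. A standalone sketch of how such a field resolves (a demo class, not Dify's actual config):

import os

from pydantic import Field
from pydantic_settings import BaseSettings


class MilvusDemoConfig(BaseSettings):
    MILVUS_ENABLE_HYBRID_SEARCH: bool = Field(
        description="Enable hybrid search features (requires Milvus >= 2.5.0)",
        default=True,
    )


os.environ["MILVUS_ENABLE_HYBRID_SEARCH"] = "false"
print(MilvusDemoConfig().MILVUS_ENABLE_HYBRID_SEARCH)  # False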
2 changes: 1 addition & 1 deletion api/configs/packaging/__init__.py
@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):

CURRENT_VERSION: str = Field(
description="Dify version",
default="0.15.0",
default="0.15.1",
)

COMMIT_SHA: str = Field(
8 changes: 6 additions & 2 deletions api/controllers/console/app/audio.py
@@ -22,7 +22,7 @@
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError
from libs.login import login_required
from models.model import AppMode
from models import App, AppMode
from services.audio_service import AudioService
from services.errors.audio import (
AudioTooLargeServiceError,
@@ -79,7 +79,7 @@ class ChatMessageTextApi(Resource):
@login_required
@account_initialization_required
@get_app_model
def post(self, app_model):
def post(self, app_model: App):
from werkzeug.exceptions import InternalServerError

try:
@@ -98,9 +98,13 @@ def post(self, app_model):
and app_model.workflow.features_dict
):
text_to_speech = app_model.workflow.features_dict.get("text_to_speech")
if text_to_speech is None:
raise ValueError("TTS is not enabled")
voice = args.get("voice") or text_to_speech.get("voice")
else:
try:
if app_model.app_model_config is None:
raise ValueError("AppModelConfig not found")
voice = args.get("voice") or app_model.app_model_config.text_to_speech_dict.get("voice")
except Exception:
voice = None
6 changes: 4 additions & 2 deletions api/controllers/console/datasets/datasets.py
@@ -52,12 +52,12 @@ def get(self):
# provider = request.args.get("provider", default="vendor")
search = request.args.get("keyword", default=None, type=str)
tag_ids = request.args.getlist("tag_ids")

include_all = request.args.get("include_all", default="false").lower() == "true"
if ids:
datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
else:
datasets, total = DatasetService.get_datasets(
page, limit, current_user.current_tenant_id, current_user, search, tag_ids
page, limit, current_user.current_tenant_id, current_user, search, tag_ids, include_all
)

# check embedding setting
@@ -640,6 +640,7 @@ def get(self):
| VectorType.MYSCALE
| VectorType.ORACLE
| VectorType.ELASTICSEARCH
| VectorType.ELASTICSEARCH_JA
| VectorType.PGVECTOR
| VectorType.TIDB_ON_QDRANT
| VectorType.LINDORM
@@ -683,6 +684,7 @@ def get(self, vector_type):
| VectorType.MYSCALE
| VectorType.ORACLE
| VectorType.ELASTICSEARCH
| VectorType.ELASTICSEARCH_JA
| VectorType.COUCHBASE
| VectorType.PGVECTOR
| VectorType.LINDORM
3 changes: 2 additions & 1 deletion api/controllers/console/datasets/datasets_document.py
@@ -257,7 +257,8 @@ def post(self, dataset_id):
parser.add_argument("original_document_id", type=str, required=False, location="json")
parser.add_argument("doc_form", type=str, default="text_model", required=False, nullable=False, location="json")
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")

parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json")
parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json")
parser.add_argument(
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
4 changes: 2 additions & 2 deletions api/controllers/console/datasets/datasets_segments.py
@@ -368,9 +368,9 @@ def post(self, dataset_id, document_id):
result = []
for index, row in df.iterrows():
if document.doc_form == "qa_model":
data = {"content": row[0], "answer": row[1]}
data = {"content": row.iloc[0], "answer": row.iloc[1]}
else:
data = {"content": row[0]}
data = {"content": row.iloc[0]}
result.append(data)
if len(result) == 0:
raise ValueError("The CSV file is empty.")
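The switch to iloc matters because the Series produced by iterrows() is labeled by column name: row[0] only worked through pandas' integer-key positional fallback, which is deprecated in recent releases and now raises for non-integer indexes. row.iloc[0] addresses the first column by position explicitly. A small illustration with hypothetical data:

import pandas as pd

df = pd.DataFrame([["What is Dify?", "An LLM app platform."]], columns=["content", "answer"])
for _, row in df.iterrows():
    # row[0] would require a column literally named 0 (or the deprecated
    # fallback); iloc selects by position regardless of column labels.
    data = {"content": row.iloc[0], "answer": row.iloc[1]}
    print(data)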
2 changes: 1 addition & 1 deletion api/controllers/console/explore/conversation.py
@@ -32,7 +32,7 @@ def get(self, installed_app):

pinned = None
if "pinned" in args and args["pinned"] is not None:
pinned = True if args["pinned"] == "true" else False
pinned = args["pinned"] == "true"

try:
with Session(db.engine) as session:
2 changes: 1 addition & 1 deletion api/controllers/service_api/__init__.py
@@ -7,4 +7,4 @@

from . import index
from .app import app, audio, completion, conversation, file, message, workflow
from .dataset import dataset, document, hit_testing, segment
from .dataset import dataset, document, hit_testing, segment, upload_file
5 changes: 4 additions & 1 deletion api/controllers/service_api/dataset/dataset.py
@@ -31,8 +31,11 @@ def get(self, tenant_id):
# provider = request.args.get("provider", default="vendor")
search = request.args.get("keyword", default=None, type=str)
tag_ids = request.args.getlist("tag_ids")
include_all = request.args.get("include_all", default="false").lower() == "true"

datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
datasets, total = DatasetService.get_datasets(
page, limit, tenant_id, current_user, search, tag_ids, include_all
)
# check embedding setting
provider_manager = ProviderManager()
configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)
54 changes: 54 additions & 0 deletions api/controllers/service_api/dataset/upload_file.py
@@ -0,0 +1,54 @@
from werkzeug.exceptions import NotFound

from controllers.service_api import api
from controllers.service_api.wraps import (
DatasetApiResource,
)
from core.file import helpers as file_helpers
from extensions.ext_database import db
from models.dataset import Dataset
from models.model import UploadFile
from services.dataset_service import DocumentService


class UploadFileApi(DatasetApiResource):
def get(self, tenant_id, dataset_id, document_id):
"""Get upload file."""
# check dataset
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
if not dataset:
raise NotFound("Dataset not found.")
# check document
document_id = str(document_id)
document = DocumentService.get_document(dataset.id, document_id)
if not document:
raise NotFound("Document not found.")
# check upload file
if document.data_source_type != "upload_file":
raise ValueError(f"Document data source type ({document.data_source_type}) is not upload_file.")
data_source_info = document.data_source_info_dict
if data_source_info and "upload_file_id" in data_source_info:
file_id = data_source_info["upload_file_id"]
upload_file = db.session.query(UploadFile).filter(UploadFile.id == file_id).first()
if not upload_file:
raise NotFound("UploadFile not found.")
else:
raise ValueError("Upload file id not found in document data source info.")

url = file_helpers.get_signed_file_url(upload_file_id=upload_file.id)
return {
"id": upload_file.id,
"name": upload_file.name,
"size": upload_file.size,
"extension": upload_file.extension,
"url": url,
"download_url": f"{url}&as_attachment=true",
"mime_type": upload_file.mime_type,
"created_by": upload_file.created_by,
"created_at": upload_file.created_at.timestamp(),
}, 200


api.add_resource(UploadFileApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/upload-file")
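A hypothetical client call against the new endpoint (the base URL, IDs, and API key are placeholders; the route and response fields come from the resource above, assuming the service API's usual /v1 prefix):

import requests

resp = requests.get(
    "https://api.dify.example/v1/datasets/<dataset_id>/documents/<document_id>/upload-file",
    headers={"Authorization": "Bearer <dataset-api-key>"},
)
resp.raise_for_status()
info = resp.json()
# "url" is a signed preview link; "download_url" appends &as_attachment=true.
print(info["name"], info["size"], info["download_url"])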
2 changes: 1 addition & 1 deletion api/controllers/service_api/wraps.py
@@ -236,7 +236,7 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]
tenant_id=app_model.tenant_id,
app_id=app_model.id,
type="service_api",
is_anonymous=True if user_id == "DEFAULT-USER" else False,
is_anonymous=user_id == "DEFAULT-USER",
session_id=user_id,
)
db.session.add(end_user)
2 changes: 1 addition & 1 deletion api/controllers/web/conversation.py
@@ -39,7 +39,7 @@ def get(self, app_model, end_user):

pinned = None
if "pinned" in args and args["pinned"] is not None:
pinned = True if args["pinned"] == "true" else False
pinned = args["pinned"] == "true"

try:
with Session(db.engine) as session:
Expand Down
18 changes: 14 additions & 4 deletions api/core/indexing_runner.py
@@ -530,7 +530,6 @@ def _load(
# chunk nodes by chunk size
indexing_start_at = time.perf_counter()
tokens = 0
chunk_size = 10
if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
# create keyword index
create_keyword_thread = threading.Thread(
@@ -539,11 +538,22 @@ def _load(
)
create_keyword_thread.start()

max_workers = 10
if dataset.indexing_technique == "high_quality":
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for i in range(0, len(documents), chunk_size):
chunk_documents = documents[i : i + chunk_size]

# Distribute documents into multiple groups based on the hash values of page_content
# This is done to prevent multiple threads from processing the same document,
# thereby avoiding potential database insertion deadlocks
document_groups: list[list[Document]] = [[] for _ in range(max_workers)]
for document in documents:
hash = helper.generate_text_hash(document.page_content)
group_index = int(hash, 16) % max_workers
document_groups[group_index].append(document)
for chunk_documents in document_groups:
if len(chunk_documents) == 0:
continue
futures.append(
executor.submit(
self._process_chunk,
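Sharding by content hash guarantees that identical page_content always lands in the same group, so no two worker threads ever insert the same segment concurrently. A self-contained sketch of the grouping step (sha256 stands in here for Dify's helper.generate_text_hash):

import hashlib


def group_by_content_hash(contents: list[str], max_workers: int = 10) -> list[list[str]]:
    # Each document's content hash deterministically selects one group,
    # so duplicate content never spans two workers.
    groups: list[list[str]] = [[] for _ in range(max_workers)]
    for content in contents:
        digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
        groups[int(digest, 16) % max_workers].append(content)
    return groups


print([len(g) for g in group_by_content_hash([f"chunk {i}" for i in range(25)])])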
@@ -1,7 +1,8 @@
import logging
from threading import Lock
from typing import Any

import tiktoken
logger = logging.getLogger(__name__)

_tokenizer: Any = None
_lock = Lock()
@@ -33,9 +34,18 @@ def get_encoder() -> Any:
if _tokenizer is None:
# Try to use tiktoken to get the tokenizer because it is faster
#
_tokenizer = tiktoken.get_encoding("gpt2")
# base_path = abspath(__file__)
# gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
# _tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
try:
import tiktoken

_tokenizer = tiktoken.get_encoding("gpt2")
except Exception:
from os.path import abspath, dirname, join

from transformers import GPT2Tokenizer as TransformerGPT2Tokenizer # type: ignore

base_path = abspath(__file__)
gpt2_tokenizer_path = join(dirname(base_path), "gpt2")
_tokenizer = TransformerGPT2Tokenizer.from_pretrained(gpt2_tokenizer_path)
logger.info("Fallback to Transformers' GPT-2 tokenizer from tiktoken")

return _tokenizer
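Both branches return an object with a GPT-2-compatible encode method, so token counting works the same regardless of which library loaded. A condensed sketch of the lazy fallback pattern (module and model names are illustrative; the real code loads the Transformers tokenizer from a vendored local path rather than the Hugging Face hub):

from threading import Lock
from typing import Any

_tokenizer: Any = None
_lock = Lock()


def get_encoder() -> Any:
    global _tokenizer
    with _lock:
        if _tokenizer is None:
            try:
                import tiktoken  # fast path when available

                _tokenizer = tiktoken.get_encoding("gpt2")
            except Exception:
                # Slower pure-Python fallback.
                from transformers import GPT2Tokenizer

                _tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        return _tokenizer


print(len(get_encoder().encode("hello world")))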
@@ -377,10 +377,7 @@ def _generate(
for tool in tools:
formatted_tools.append(helper.dump_model(PromptMessageFunction(function=tool)))

if prompt_messages[-1].role.value == "tool":
data["tools"] = None
else:
data["tools"] = formatted_tools
data["tools"] = formatted_tools

if stop:
data["stop"] = stop
@@ -7,6 +7,7 @@ features:
- vision
- tool-call
- stream-tool-call
- document
model_properties:
mode: chat
context_size: 200000
@@ -87,6 +87,6 @@ def _validate_credential_form_schema(
if value.lower() not in {"true", "false"}:
raise ValueError(f"Variable {credential_form_schema.variable} should be true or false")

value = True if value.lower() == "true" else False
value = value.lower() == "true"

return value
32 changes: 32 additions & 0 deletions api/core/ops/entities/config_entity.py
@@ -6,6 +6,7 @@
class TracingProviderEnum(Enum):
LANGFUSE = "langfuse"
LANGSMITH = "langsmith"
OPIK = "opik"


class BaseTracingConfig(BaseModel):
@@ -56,5 +57,36 @@ def set_value(cls, v, info: ValidationInfo):
return v


class OpikConfig(BaseTracingConfig):
"""
Model class for Opik tracing config.
"""

api_key: str | None = None
project: str | None = None
workspace: str | None = None
url: str = "https://www.comet.com/opik/api/"

@field_validator("project")
@classmethod
def project_validator(cls, v, info: ValidationInfo):
if v is None or v == "":
v = "Default Project"

return v

@field_validator("url")
@classmethod
def url_validator(cls, v, info: ValidationInfo):
if v is None or v == "":
v = "https://www.comet.com/opik/api/"
if not v.startswith(("https://", "http://")):
raise ValueError("url must start with https:// or http://")
if not v.endswith("/api/"):
raise ValueError("url should ends with /api/")

return v


OPS_FILE_PATH = "ops_trace/"
OPS_TRACE_FAILED_KEY = "FAILED_OPS_TRACE"
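The validators above normalize empty values to defaults and reject malformed endpoints; pydantic surfaces the failed url check as a ValidationError, which subclasses ValueError. An illustrative use:

cfg = OpikConfig(api_key="sk-placeholder", project="", workspace="team")
print(cfg.project)  # "Default Project"
print(cfg.url)      # "https://www.comet.com/opik/api/"

try:
    OpikConfig(url="https://example.com/opik")
except ValueError as exc:
    print(exc)  # the url must end with /api/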
