From d2586278d6dc0859b93326029e87c1af68bba851 Mon Sep 17 00:00:00 2001 From: Hiroshi Fujita Date: Wed, 8 Jan 2025 13:35:41 +0900 Subject: [PATCH] Feat elasticsearch japanese (#12194) --- api/controllers/console/datasets/datasets.py | 2 + .../elasticsearch/elasticsearch_ja_vector.py | 104 ++++++++++++++++++ api/core/rag/datasource/vdb/vector_factory.py | 6 + api/core/rag/datasource/vdb/vector_type.py | 1 + docker/.env.example | 4 +- docker/docker-compose.yaml | 10 +- docker/elasticsearch/docker-entrypoint.sh | 25 +++++ 7 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py create mode 100755 docker/elasticsearch/docker-entrypoint.sh diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index 0c0d2e20035b43..2da45a7bb67bc6 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -640,6 +640,7 @@ def get(self): | VectorType.MYSCALE | VectorType.ORACLE | VectorType.ELASTICSEARCH + | VectorType.ELASTICSEARCH_JA | VectorType.PGVECTOR | VectorType.TIDB_ON_QDRANT | VectorType.LINDORM @@ -683,6 +684,7 @@ def get(self, vector_type): | VectorType.MYSCALE | VectorType.ORACLE | VectorType.ELASTICSEARCH + | VectorType.ELASTICSEARCH_JA | VectorType.COUCHBASE | VectorType.PGVECTOR | VectorType.LINDORM diff --git a/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py new file mode 100644 index 00000000000000..27575197faccfe --- /dev/null +++ b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py @@ -0,0 +1,104 @@ +import json +import logging +from typing import Any, Optional + +from flask import current_app + +from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ( + ElasticSearchConfig, + ElasticSearchVector, + ElasticSearchVectorFactory, +) +from core.rag.datasource.vdb.field import Field +from core.rag.datasource.vdb.vector_type import VectorType +from core.rag.embedding.embedding_base import Embeddings +from extensions.ext_redis import redis_client +from models.dataset import Dataset + +logger = logging.getLogger(__name__) + + +class ElasticSearchJaVector(ElasticSearchVector): + def create_collection( + self, + embeddings: list[list[float]], + metadatas: Optional[list[dict[Any, Any]]] = None, + index_params: Optional[dict] = None, + ): + lock_name = f"vector_indexing_lock_{self._collection_name}" + with redis_client.lock(lock_name, timeout=20): + collection_exist_cache_key = f"vector_indexing_{self._collection_name}" + if redis_client.get(collection_exist_cache_key): + logger.info(f"Collection {self._collection_name} already exists.") + return + + if not self._client.indices.exists(index=self._collection_name): + dim = len(embeddings[0]) + settings = { + "analysis": { + "analyzer": { + "ja_analyzer": { + "type": "custom", + "char_filter": [ + "icu_normalizer", + "kuromoji_iteration_mark", + ], + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "kuromoji_baseform", + "kuromoji_part_of_speech", + "ja_stop", + "kuromoji_number", + "kuromoji_stemmer", + ], + } + } + } + } + mappings = { + "properties": { + Field.CONTENT_KEY.value: { + "type": "text", + "analyzer": "ja_analyzer", + "search_analyzer": "ja_analyzer", + }, + Field.VECTOR.value: { # Make sure the dimension is correct here + "type": "dense_vector", + "dims": dim, + "index": True, + "similarity": "cosine", + }, + Field.METADATA_KEY.value: { + "type": "object", + "properties": { + "doc_id": {"type": "keyword"} # Map doc_id to keyword type + }, + }, + } + } + self._client.indices.create(index=self._collection_name, settings=settings, mappings=mappings) + + redis_client.set(collection_exist_cache_key, 1, ex=3600) + + +class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory): + def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector: + if dataset.index_struct_dict: + class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] + collection_name = class_prefix + else: + dataset_id = dataset.id + collection_name = Dataset.gen_collection_name_by_id(dataset_id) + dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name)) + + config = current_app.config + return ElasticSearchJaVector( + index_name=collection_name, + config=ElasticSearchConfig( + host=config.get("ELASTICSEARCH_HOST", "localhost"), + port=config.get("ELASTICSEARCH_PORT", 9200), + username=config.get("ELASTICSEARCH_USERNAME", ""), + password=config.get("ELASTICSEARCH_PASSWORD", ""), + ), + attributes=[], + ) diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 523fa80f124b0c..bdc40e29c7e838 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -90,6 +90,12 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]: from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory return ElasticSearchVectorFactory + case VectorType.ELASTICSEARCH_JA: + from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import ( + ElasticSearchJaVectorFactory, + ) + + return ElasticSearchJaVectorFactory case VectorType.TIDB_VECTOR: from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py index 05183c03717fa3..e73411aa0d38a9 100644 --- a/api/core/rag/datasource/vdb/vector_type.py +++ b/api/core/rag/datasource/vdb/vector_type.py @@ -16,6 +16,7 @@ class VectorType(StrEnum): TENCENT = "tencent" ORACLE = "oracle" ELASTICSEARCH = "elasticsearch" + ELASTICSEARCH_JA = "elasticsearch-ja" LINDORM = "lindorm" COUCHBASE = "couchbase" BAIDU = "baidu" diff --git a/docker/.env.example b/docker/.env.example index 7c5447ef5bbf17..85277027b663c2 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url # ------------------------------ # The type of vector store to use. -# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`. +# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`. VECTOR_STORE=weaviate # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. @@ -512,7 +512,7 @@ TENCENT_VECTOR_DB_SHARD=1 TENCENT_VECTOR_DB_REPLICAS=2 # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` -ELASTICSEARCH_HOST=0.0.0.0 +ELASTICSEARCH_HOST=elasticsearch ELASTICSEARCH_PORT=9200 ELASTICSEARCH_USERNAME=elastic ELASTICSEARCH_PASSWORD=elastic diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index a24d4fbbd0d477..29e312de7f9c8b 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -883,20 +883,28 @@ services: container_name: elasticsearch profiles: - elasticsearch + - elasticsearch-ja restart: always volumes: + - ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh - dify_es01_data:/usr/share/elasticsearch/data environment: ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic} + VECTOR_STORE: ${VECTOR_STORE:-} cluster.name: dify-es-cluster node.name: dify-es0 discovery.type: single-node - xpack.license.self_generated.type: trial + xpack.license.self_generated.type: basic xpack.security.enabled: 'true' xpack.security.enrollment.enabled: 'false' xpack.security.http.ssl.enabled: 'false' ports: - ${ELASTICSEARCH_PORT:-9200}:9200 + deploy: + resources: + limits: + memory: 2g + entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ] healthcheck: test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ] interval: 30s diff --git a/docker/elasticsearch/docker-entrypoint.sh b/docker/elasticsearch/docker-entrypoint.sh new file mode 100755 index 00000000000000..6669aec5a921c2 --- /dev/null +++ b/docker/elasticsearch/docker-entrypoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -e + +if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then + # Check if the ICU tokenizer plugin is installed + if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-icu; then + printf '%s\n' "Installing the ICU tokenizer plugin" + if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu; then + printf '%s\n' "Failed to install the ICU tokenizer plugin" + exit 1 + fi + fi + # Check if the Japanese language analyzer plugin is installed + if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-kuromoji; then + printf '%s\n' "Installing the Japanese language analyzer plugin" + if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji; then + printf '%s\n' "Failed to install the Japanese language analyzer plugin" + exit 1 + fi + fi +fi + +# Run the original entrypoint script +exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh