From d2586278d6dc0859b93326029e87c1af68bba851 Mon Sep 17 00:00:00 2001
From: Hiroshi Fujita <fujita-h@users.noreply.github.com>
Date: Wed, 8 Jan 2025 13:35:41 +0900
Subject: [PATCH] Feat elasticsearch japanese (#12194)

---
 api/controllers/console/datasets/datasets.py  |   2 +
 .../elasticsearch/elasticsearch_ja_vector.py  | 104 ++++++++++++++++++
 api/core/rag/datasource/vdb/vector_factory.py |   6 +
 api/core/rag/datasource/vdb/vector_type.py    |   1 +
 docker/.env.example                           |   4 +-
 docker/docker-compose.yaml                    |  10 +-
 docker/elasticsearch/docker-entrypoint.sh     |  25 +++++
 7 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py
 create mode 100755 docker/elasticsearch/docker-entrypoint.sh

diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py
index 0c0d2e20035b43..2da45a7bb67bc6 100644
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -640,6 +640,7 @@ def get(self):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.PGVECTOR
                 | VectorType.TIDB_ON_QDRANT
                 | VectorType.LINDORM
@@ -683,6 +684,7 @@ def get(self, vector_type):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.COUCHBASE
                 | VectorType.PGVECTOR
                 | VectorType.LINDORM
diff --git a/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py
new file mode 100644
index 00000000000000..27575197faccfe
--- /dev/null
+++ b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py
@@ -0,0 +1,104 @@
+import json
+import logging
+from typing import Any, Optional
+
+from flask import current_app
+
+from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import (
+    ElasticSearchConfig,
+    ElasticSearchVector,
+    ElasticSearchVectorFactory,
+)
+from core.rag.datasource.vdb.field import Field
+from core.rag.datasource.vdb.vector_type import VectorType
+from core.rag.embedding.embedding_base import Embeddings
+from extensions.ext_redis import redis_client
+from models.dataset import Dataset
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticSearchJaVector(ElasticSearchVector):
+    def create_collection(
+        self,
+        embeddings: list[list[float]],
+        metadatas: Optional[list[dict[Any, Any]]] = None,
+        index_params: Optional[dict] = None,
+    ):
+        lock_name = f"vector_indexing_lock_{self._collection_name}"
+        with redis_client.lock(lock_name, timeout=20):
+            collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
+            if redis_client.get(collection_exist_cache_key):
+                logger.info(f"Collection {self._collection_name} already exists.")
+                return
+
+            if not self._client.indices.exists(index=self._collection_name):
+                dim = len(embeddings[0])
+                settings = {
+                    "analysis": {
+                        "analyzer": {
+                            "ja_analyzer": {
+                                "type": "custom",
+                                "char_filter": [
+                                    "icu_normalizer",
+                                    "kuromoji_iteration_mark",
+                                ],
+                                "tokenizer": "kuromoji_tokenizer",
+                                "filter": [
+                                    "kuromoji_baseform",
+                                    "kuromoji_part_of_speech",
+                                    "ja_stop",
+                                    "kuromoji_number",
+                                    "kuromoji_stemmer",
+                                ],
+                            }
+                        }
+                    }
+                }
+                mappings = {
+                    "properties": {
+                        Field.CONTENT_KEY.value: {
+                            "type": "text",
+                            "analyzer": "ja_analyzer",
+                            "search_analyzer": "ja_analyzer",
+                        },
+                        Field.VECTOR.value: {  # Make sure the dimension is correct here
+                            "type": "dense_vector",
+                            "dims": dim,
+                            "index": True,
+                            "similarity": "cosine",
+                        },
+                        Field.METADATA_KEY.value: {
+                            "type": "object",
+                            "properties": {
+                                "doc_id": {"type": "keyword"}  # Map doc_id to keyword type
+                            },
+                        },
+                    }
+                }
+                self._client.indices.create(index=self._collection_name, settings=settings, mappings=mappings)
+
+            redis_client.set(collection_exist_cache_key, 1, ex=3600)
+
+
+class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
+    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
+        if dataset.index_struct_dict:
+            class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
+            collection_name = class_prefix
+        else:
+            dataset_id = dataset.id
+            collection_name = Dataset.gen_collection_name_by_id(dataset_id)
+            dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name))
+
+        config = current_app.config
+        return ElasticSearchJaVector(
+            index_name=collection_name,
+            config=ElasticSearchConfig(
+                host=config.get("ELASTICSEARCH_HOST", "localhost"),
+                port=config.get("ELASTICSEARCH_PORT", 9200),
+                username=config.get("ELASTICSEARCH_USERNAME", ""),
+                password=config.get("ELASTICSEARCH_PASSWORD", ""),
+            ),
+            attributes=[],
+        )
diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py
index 523fa80f124b0c..bdc40e29c7e838 100644
--- a/api/core/rag/datasource/vdb/vector_factory.py
+++ b/api/core/rag/datasource/vdb/vector_factory.py
@@ -90,6 +90,12 @@ def get_vector_factory(vector_type: str) -> type[AbstractVectorFactory]:
                 from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
 
                 return ElasticSearchVectorFactory
+            case VectorType.ELASTICSEARCH_JA:
+                from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import (
+                    ElasticSearchJaVectorFactory,
+                )
+
+                return ElasticSearchJaVectorFactory
             case VectorType.TIDB_VECTOR:
                 from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory
 
diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py
index 05183c03717fa3..e73411aa0d38a9 100644
--- a/api/core/rag/datasource/vdb/vector_type.py
+++ b/api/core/rag/datasource/vdb/vector_type.py
@@ -16,6 +16,7 @@ class VectorType(StrEnum):
     TENCENT = "tencent"
     ORACLE = "oracle"
     ELASTICSEARCH = "elasticsearch"
+    ELASTICSEARCH_JA = "elasticsearch-ja"
     LINDORM = "lindorm"
     COUCHBASE = "couchbase"
     BAIDU = "baidu"
diff --git a/docker/.env.example b/docker/.env.example
index 7c5447ef5bbf17..85277027b663c2 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url
 # ------------------------------
 
 # The type of vector store to use.
-# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
+# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
 VECTOR_STORE=weaviate
 
 # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
@@ -512,7 +512,7 @@ TENCENT_VECTOR_DB_SHARD=1
 TENCENT_VECTOR_DB_REPLICAS=2
 
 # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch`
-ELASTICSEARCH_HOST=0.0.0.0
+ELASTICSEARCH_HOST=elasticsearch
 ELASTICSEARCH_PORT=9200
 ELASTICSEARCH_USERNAME=elastic
 ELASTICSEARCH_PASSWORD=elastic
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index a24d4fbbd0d477..29e312de7f9c8b 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -883,20 +883,28 @@ services:
     container_name: elasticsearch
     profiles:
       - elasticsearch
+      - elasticsearch-ja
     restart: always
     volumes:
+      - ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh
       - dify_es01_data:/usr/share/elasticsearch/data
     environment:
       ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
+      VECTOR_STORE: ${VECTOR_STORE:-}
       cluster.name: dify-es-cluster
       node.name: dify-es0
       discovery.type: single-node
-      xpack.license.self_generated.type: trial
+      xpack.license.self_generated.type: basic
       xpack.security.enabled: 'true'
       xpack.security.enrollment.enabled: 'false'
       xpack.security.http.ssl.enabled: 'false'
     ports:
       - ${ELASTICSEARCH_PORT:-9200}:9200
+    deploy:
+      resources:
+        limits:
+          memory: 2g
+    entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ]
     healthcheck:
       test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
       interval: 30s
diff --git a/docker/elasticsearch/docker-entrypoint.sh b/docker/elasticsearch/docker-entrypoint.sh
new file mode 100755
index 00000000000000..6669aec5a921c2
--- /dev/null
+++ b/docker/elasticsearch/docker-entrypoint.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -e
+
+if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then
+    # Check if the ICU tokenizer plugin is installed
+    if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-icu; then
+        printf '%s\n' "Installing the ICU tokenizer plugin"
+        if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu; then
+            printf '%s\n' "Failed to install the ICU tokenizer plugin"
+            exit 1
+        fi
+    fi
+    # Check if the Japanese language analyzer plugin is installed
+    if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-kuromoji; then
+        printf '%s\n' "Installing the Japanese language analyzer plugin"
+        if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji; then
+            printf '%s\n' "Failed to install the Japanese language analyzer plugin"
+            exit 1
+        fi
+    fi
+fi
+
+# Run the original entrypoint script
+exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh