[fix](unicode) fix 4 bytes unicode read and write bug (#292)

airborne12 · web-flow · commit 8494cba761d8 · 2025-03-06T21:59:32.000+08:00
* [fix](build) fix build for clucene-2.0
* [fix](unicode) fix 4 bytes unicode read and write bug (#289)
* [fix](unicode) fix 4 bytes unicode read and write bug (#255)
* [fix](unicode) fix 4 bytes unicode read and write bug
* [fix](unicode) resolve truncation problem for Unicode code points above 0xFFFF using a compatible approach (#284)
  fix truncate problem and add write flag
* [test](unicode) add more ut
* fix ut
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -25,7 +25,7 @@ jobs:
       matrix:
         config:
           - name: macOS
-            os: macos-12
+            os: macos-15
             packages: >-
               'automake'
               'autoconf'
diff --git a/.github/workflows/clucene-ut.yml b/.github/workflows/clucene-ut.yml
@@ -22,6 +22,8 @@ on:
     branches:
     - clucene
     - clucene-2.0
+    - clucene-2.1
+    - clucene-3.0
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
@@ -78,7 +80,7 @@ jobs:
 
   run_clucene_ut_macos:
     name: CLucene UT (MacOS)
-    runs-on: macos-12
+    runs-on: macos-15
     steps:
     - name: "Checkout ${{ github.event.pull_request.number }} ${{ github.event.pull_request.head.sha }}"
       uses: actions/checkout@v4
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -194,6 +194,7 @@ INCLUDE_DIRECTORIES( ${_CL_BOOST_INCLUDE_PATH} )
 ADD_SUBDIRECTORY (src/ext)
 include(cmake/FindRoaring.cmake)
 find_package(Roaring REQUIRED)
+INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/ext/sse2neon)
 #TurboPFOR
 ADD_SUBDIRECTORY (src/ext/for)
 ADD_SUBDIRECTORY (src/shared)
diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
@@ -187,6 +187,16 @@ int32_t IndexWriter::getTermIndexInterval() {
     return termIndexInterval;
 }
 
+bool IndexWriter::getEnableCorrectTermWrite() {
+    ensureOpen();
+    return enableCorrectTermWrite;
+}
+
+void IndexWriter::setEnableCorrectTermWrite(bool enableCorrectTermWrite) {
+    ensureOpen();
+    this->enableCorrectTermWrite = enableCorrectTermWrite;
+}
+
 IndexWriter::IndexWriter(const char *path, Analyzer *a, bool create) : bOwnsDirectory(true) {
     init(FSDirectory::getDirectory(path, create), a, create, true, (IndexDeletionPolicy *) NULL, true);
 }
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
@@ -259,7 +259,7 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
   int32_t minMergeDocs;
   int32_t maxMergeDocs;
   int32_t termIndexInterval;
-
+  bool enableCorrectTermWrite;
   int64_t writeLockTimeout;
   int64_t commitLockTimeout;
 
@@ -523,6 +523,9 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
    */
   int32_t getTermIndexInterval();
 
+  bool getEnableCorrectTermWrite();
+  void setEnableCorrectTermWrite(bool enableCorrectTermWrite);
+
   /**Determines the largest number of documents ever merged by addDocument().
    *  Small values (e.g., less than 10,000) are best for interactive indexing,
    *  as this limits the length of pauses while indexing to a few seconds.
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -1006,7 +1006,7 @@ void SDocumentsWriter<T>::writeSegment(std::vector<std::string> &flushedFiles) {
 
     auto *termsOut = _CLNEW STermInfosWriter<T>(directory, segmentName.c_str(), fieldInfos,
                                             writer->getTermIndexInterval());
-
+    termsOut->setEnableCorrectTermWrite(writer->getEnableCorrectTermWrite());
     IndexOutput *freqOut = directory->createOutput((segmentName + ".frq").c_str());
     // TODO:add options in field index
     IndexOutput *proxOut = nullptr;
diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -205,7 +205,11 @@ void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, int3
 
         output->writeVInt(start);
         output->writeVInt(length);
-        output->writeSChars(newTermWStr.data() + start, length);
+        if (enableCorrectTermWrite_) {
+            output->writeSChars(newTermWStr.data() + start, length);
+        } else {
+            output->writeSCharsOrigin(newTermWStr.data() + start, length);
+        }
         output->writeVInt(fieldNumber);
     } else {
         int32_t start = 0;
@@ -218,13 +222,22 @@ void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, int3
 
         int32_t length = termTextLength - start;
 
-        output->writeVInt(start);                    // write shared prefix length
-        output->writeVInt(length);                   // write delta length
-        output->writeSChars(termText + start, length);// write delta chars
-        output->writeVInt(fieldNumber);              // write field num
+        output->writeVInt(start);
+        output->writeVInt(length);
+        if (enableCorrectTermWrite_) {
+            output->writeSChars(termText + start, length);
+        } else {
+            output->writeSCharsOrigin(termText + start, length);
+        }
+        output->writeVInt(fieldNumber);
     }
 }
 
+template <typename T>
+void STermInfosWriter<T>::setEnableCorrectTermWrite(bool enableCorrectTermWrite) {
+    enableCorrectTermWrite_ = enableCorrectTermWrite;
+}
+
 template class STermInfosWriter<char>;
 template class STermInfosWriter<TCHAR>;
 
diff --git a/src/core/CLucene/index/_TermInfosWriter.h b/src/core/CLucene/index/_TermInfosWriter.h
@@ -61,9 +61,12 @@ class STermInfosWriter : LUCENE_BASE {
 
     void close();
 
+    void setEnableCorrectTermWrite(bool enableCorrectTermWrite);
+
 private:
     void initialise(CL_NS(store)::Directory *directory, const char *segment, int32_t interval, bool IsIndex);
     void writeTerm(int32_t fieldNumber, const T *termText, int32_t termTextLength);
+    bool enableCorrectTermWrite_ = true;
 };
 
 // This stores a monotonically increasing set of <Term, TermInfo> pairs in a
diff --git a/src/core/CLucene/store/IndexInput.cpp b/src/core/CLucene/store/IndexInput.cpp
@@ -129,28 +129,29 @@ CL_NS_USE(util)
       readBytes(b, len, offset);
   }
 
-  void IndexInput::readChars( TCHAR* buffer, const int32_t start, const int32_t len) {
+void IndexInput::readChars(TCHAR* buffer, const int32_t start, const int32_t len) {
     const int32_t end = start + len;
     TCHAR b;
     for (int32_t i = start; i < end; ++i) {
-      b = readByte();
-      if ((b & 0x80) == 0) {
-        b = (b & 0x7F);
-      } else if ((b & 0xE0) != 0xE0) {
-        b = (((b & 0x1F) << 6)
-          | (readByte() & 0x3F));
-      } else {
-		  b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6);
-		  b |= (readByte() & 0x3F);
-      }
-      buffer[i] = b;
-	}
-  }
-
-
-
-
-
+        b = readByte();
+        if ((b & 0x80) == 0) {
+            b = (b & 0x7F);
+        } else if (b >= 0x80 && b <= 0x84) {
+            // NOTE: This is not standard UTF-8 encoding, but it is the format currently in use.
+            // It must be distinguishable from the previous, incorrect encoding: the old code wrote 4-byte characters as 3-byte sequences starting with 0xF0-0xFF,
+            // which would be confused with correctly encoded 4-byte UTF-8 characters.
+            // This is a temporary solution; a better way to handle this still needs to be found.
+            b = ((b & 0x07) << 18) | ((readByte() & 0x3F) << 12) | ((readByte() & 0x3F) << 6) |
+                (readByte() & 0x3F);
+        } else if ((b & 0xE0) != 0xE0) {
+            b = (((b & 0x1F) << 6) | (readByte() & 0x3F));
+        } else {
+            b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6);
+            b |= (readByte() & 0x3F);
+        }
+        buffer[i] = b;
+    }
+}
 
 BufferedIndexInput::BufferedIndexInput(int32_t _bufferSize):
 		buffer(NULL),
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
@@ -158,8 +158,8 @@ CL_NS_DEF(store)
     writeChars(s, length);
   }
 
-  template <>
-  void IndexOutput::writeSChars(const TCHAR* s, const int32_t length){
+ template <>
+  void IndexOutput::writeSCharsOrigin(const TCHAR* s, const int32_t length){
       if ( length < 0 )
           _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
 
@@ -179,6 +179,40 @@ CL_NS_DEF(store)
       }
   }
 
+  template <>
+  void IndexOutput::writeSChars(const TCHAR* s, const int32_t length) {
+      if (length < 0)
+          _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
+
+      const int32_t end = length;
+      for (int32_t i = 0; i < end; ++i) {
+          auto code = (uint32_t)s[i];
+          if (code >= 0x00 && code <= 0x7F) {
+            writeByte((uint8_t)code);
+          } else if (code <= 0x7FF) {
+            writeByte((uint8_t)(0xC0 | (code >> 6)));
+            writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0xFFFF) {
+            writeByte((uint8_t)(0xE0 | (code >> 12)));
+            writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+            writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0x10FFFF) {
+            // NOTE: This is not standard UTF-8 encoding, but it is the format currently in use.
+            // It must be distinguishable from the previous, incorrect encoding: the old code wrote 4-byte characters as 3-byte sequences starting with 0xF0-0xFF,
+            // which would be confused with correctly encoded 4-byte UTF-8 characters.
+            // This is a temporary solution; a better way to handle this still needs to be found.
+            writeByte((uint8_t)(0x80 | (code >> 18)));
+            writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
+            writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+            writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else {
+            writeByte(0xEF);
+            writeByte(0xBF);
+            writeByte(0xBD);
+          }
+      }
+  }
+
   template <>
   void IndexOutput::writeSChars(const char* s, const int32_t length){
       if ( length < 0 )
@@ -187,26 +221,38 @@ CL_NS_DEF(store)
       writeBytes((const uint8_t*)s, length);
   }
 
-  void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
-    if ( length < 0 )
-      _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
-
-    const int32_t end = length;
-    for (int32_t i = 0; i < end; ++i) {
-        const int32_t code = (int32_t)s[i];
-        if (code >= 0x01 && code <= 0x7F)
-					writeByte((uint8_t)code);
-        else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
-					writeByte((uint8_t)(0xC0 | (code >> 6)));
-					writeByte((uint8_t)(0x80 | (code & 0x3F)));
-        } else {
-					writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift
-					writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
-					writeByte((uint8_t)(0x80 | (code & 0x3F)));
-        }
-    }
-  }
+  void IndexOutput::writeChars(const TCHAR* s, const int32_t length) {
+      if (length < 0)
+          _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
 
+      const int32_t end = length;
+      for (int32_t i = 0; i < end; ++i) {
+          auto code = (uint32_t)s[i];
+          if (code >= 0x00 && code <= 0x7F) {
+            writeByte((uint8_t)code);
+          } else if (code <= 0x7FF) {
+            writeByte((uint8_t)(0xC0 | (code >> 6)));
+            writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0xFFFF) {
+            writeByte((uint8_t)(0xE0 | (code >> 12)));
+            writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+            writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0x10FFFF) {
+            // NOTE: This is not standard UTF-8 encoding, but it is the format currently in use.
+            // It must be distinguishable from the previous, incorrect encoding: the old code wrote 4-byte characters as 3-byte sequences starting with 0xF0-0xFF,
+            // which would be confused with correctly encoded 4-byte UTF-8 characters.
+            // This is a temporary solution; a better way to handle this still needs to be found.
+            writeByte((uint8_t)(0x80 | (code >> 18)));
+            writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
+            writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+            writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else {
+            writeByte(0xEF);
+            writeByte(0xBF);
+            writeByte(0xBD);
+          }
+      }
+  }
 
   int64_t BufferedIndexOutput::getFilePointer() const{
     return bufferStart + bufferPosition;
diff --git a/src/core/CLucene/store/IndexOutput.h b/src/core/CLucene/store/IndexOutput.h
@@ -84,6 +84,9 @@ class CLUCENE_EXPORT IndexOutput:LUCENE_BASE{
     template<typename T>
     void writeSChars(const T* s, int32_t length);
 
+    template<typename T>
+    void writeSCharsOrigin(const T* s, int32_t length);
+
     /** Closes this stream to further operations. */
 	virtual void close() = 0;
 
diff --git a/src/ext/sse2neon/sse2neon.h b/src/ext/sse2neon/sse2neon.h
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
diff --git a/src/test/store/TestUTF8Chars.cpp b/src/test/store/TestUTF8Chars.cpp
diff --git a/src/test/test.h b/src/test/test.h
diff --git a/src/test/tests.cpp b/src/test/tests.cpp