Skip to content

Commit 8494cba

Browse files
authored
[fix](unicode) fix 4 bytes unicode read and write bug (#292)
* [fix](build) fix build for clucene-2.0 * [fix](unicode) fix 4 bytes unicode read and write bug (#289) * [fix](unicode) fix 4 bytes unicode read and write bug (#255) * [fix](unicode) fix 4 bytes unicode read and write bug * [fix](unicode) resolve truncation problem for Unicode code points above 0xFFFF using a compatible approach (#284) fix truncate problem and add write flag * [test](unicode) add more ut * fix ut
1 parent 3526de7 commit 8494cba

16 files changed

+11330
-49
lines changed

.github/workflows/build.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ jobs:
2525
matrix:
2626
config:
2727
- name: macOS
28-
os: macos-12
28+
os: macos-15
2929
packages: >-
3030
'automake'
3131
'autoconf'

.github/workflows/clucene-ut.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ on:
2222
branches:
2323
- clucene
2424
- clucene-2.0
25+
- clucene-2.1
26+
- clucene-3.0
2527

2628
concurrency:
2729
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
@@ -78,7 +80,7 @@ jobs:
7880

7981
run_clucene_ut_macos:
8082
name: CLucene UT (MacOS)
81-
runs-on: macos-12
83+
runs-on: macos-15
8284
steps:
8385
- name: "Checkout ${{ github.event.pull_request.number }} ${{ github.event.pull_request.head.sha }}"
8486
uses: actions/checkout@v4

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ INCLUDE_DIRECTORIES( ${_CL_BOOST_INCLUDE_PATH} )
194194
ADD_SUBDIRECTORY (src/ext)
195195
include(cmake/FindRoaring.cmake)
196196
find_package(Roaring REQUIRED)
197+
INCLUDE_DIRECTORIES(${clucene_SOURCE_DIR}/src/ext/sse2neon)
197198
#TurboPFOR
198199
ADD_SUBDIRECTORY (src/ext/for)
199200
ADD_SUBDIRECTORY (src/shared)

src/core/CLucene/index/IndexWriter.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,16 @@ int32_t IndexWriter::getTermIndexInterval() {
187187
return termIndexInterval;
188188
}
189189

190+
bool IndexWriter::getEnableCorrectTermWrite() {
191+
ensureOpen();
192+
return enableCorrectTermWrite;
193+
}
194+
195+
void IndexWriter::setEnableCorrectTermWrite(bool enableCorrectTermWrite) {
196+
ensureOpen();
197+
this->enableCorrectTermWrite = enableCorrectTermWrite;
198+
}
199+
190200
IndexWriter::IndexWriter(const char *path, Analyzer *a, bool create) : bOwnsDirectory(true) {
191201
init(FSDirectory::getDirectory(path, create), a, create, true, (IndexDeletionPolicy *) NULL, true);
192202
}

src/core/CLucene/index/IndexWriter.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
259259
int32_t minMergeDocs;
260260
int32_t maxMergeDocs;
261261
int32_t termIndexInterval;
262-
262+
bool enableCorrectTermWrite;
263263
int64_t writeLockTimeout;
264264
int64_t commitLockTimeout;
265265

@@ -523,6 +523,9 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
523523
*/
524524
int32_t getTermIndexInterval();
525525

526+
bool getEnableCorrectTermWrite();
527+
void setEnableCorrectTermWrite(bool enableCorrectTermWrite);
528+
526529
/**Determines the largest number of documents ever merged by addDocument().
527530
* Small values (e.g., less than 10,000) are best for interactive indexing,
528531
* as this limits the length of pauses while indexing to a few seconds.

src/core/CLucene/index/SDocumentWriter.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1006,7 +1006,7 @@ void SDocumentsWriter<T>::writeSegment(std::vector<std::string> &flushedFiles) {
10061006

10071007
auto *termsOut = _CLNEW STermInfosWriter<T>(directory, segmentName.c_str(), fieldInfos,
10081008
writer->getTermIndexInterval());
1009-
1009+
termsOut->setEnableCorrectTermWrite(writer->getEnableCorrectTermWrite());
10101010
IndexOutput *freqOut = directory->createOutput((segmentName + ".frq").c_str());
10111011
// TODO:add options in field index
10121012
IndexOutput *proxOut = nullptr;

src/core/CLucene/index/TermInfosWriter.cpp

+18-5
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,11 @@ void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, int3
205205

206206
output->writeVInt(start);
207207
output->writeVInt(length);
208-
output->writeSChars(newTermWStr.data() + start, length);
208+
if (enableCorrectTermWrite_) {
209+
output->writeSChars(newTermWStr.data() + start, length);
210+
} else {
211+
output->writeSCharsOrigin(newTermWStr.data() + start, length);
212+
}
209213
output->writeVInt(fieldNumber);
210214
} else {
211215
int32_t start = 0;
@@ -218,13 +222,22 @@ void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, int3
218222

219223
int32_t length = termTextLength - start;
220224

221-
output->writeVInt(start); // write shared prefix length
222-
output->writeVInt(length); // write delta length
223-
output->writeSChars(termText + start, length);// write delta chars
224-
output->writeVInt(fieldNumber); // write field num
225+
output->writeVInt(start);
226+
output->writeVInt(length);
227+
if (enableCorrectTermWrite_) {
228+
output->writeSChars(termText + start, length);
229+
} else {
230+
output->writeSCharsOrigin(termText + start, length);
231+
}
232+
output->writeVInt(fieldNumber);
225233
}
226234
}
227235

236+
template <typename T>
237+
void STermInfosWriter<T>::setEnableCorrectTermWrite(bool enableCorrectTermWrite) {
238+
enableCorrectTermWrite_ = enableCorrectTermWrite;
239+
}
240+
228241
template class STermInfosWriter<char>;
229242
template class STermInfosWriter<TCHAR>;
230243

src/core/CLucene/index/_TermInfosWriter.h

+3
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,12 @@ class STermInfosWriter : LUCENE_BASE {
6161

6262
void close();
6363

64+
void setEnableCorrectTermWrite(bool enableCorrectTermWrite);
65+
6466
private:
6567
void initialise(CL_NS(store)::Directory *directory, const char *segment, int32_t interval, bool IsIndex);
6668
void writeTerm(int32_t fieldNumber, const T *termText, int32_t termTextLength);
69+
bool enableCorrectTermWrite_ = true;
6770
};
6871

6972
// This stores a monotonically increasing set of <Term, TermInfo> pairs in a

src/core/CLucene/store/IndexInput.cpp

+20-19
Original file line numberDiff line numberDiff line change
@@ -129,28 +129,29 @@ CL_NS_USE(util)
129129
readBytes(b, len, offset);
130130
}
131131

132-
void IndexInput::readChars( TCHAR* buffer, const int32_t start, const int32_t len) {
132+
void IndexInput::readChars(TCHAR* buffer, const int32_t start, const int32_t len) {
133133
const int32_t end = start + len;
134134
TCHAR b;
135135
for (int32_t i = start; i < end; ++i) {
136-
b = readByte();
137-
if ((b & 0x80) == 0) {
138-
b = (b & 0x7F);
139-
} else if ((b & 0xE0) != 0xE0) {
140-
b = (((b & 0x1F) << 6)
141-
| (readByte() & 0x3F));
142-
} else {
143-
b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6);
144-
b |= (readByte() & 0x3F);
145-
}
146-
buffer[i] = b;
147-
}
148-
}
149-
150-
151-
152-
153-
136+
b = readByte();
137+
if ((b & 0x80) == 0) {
138+
b = (b & 0x7F);
139+
} else if (b >= 0x80 && b <= 0x84) {
140+
// NOTE: This is not correct UTF-8 encoding, but it is what we are doing now.
141+
// We must distinguish it from the previous, incorrect encoding: the old code wrote 4-byte characters as 3-byte sequences starting with 0xF0-0xFF.
142+
// Those sequences would otherwise be confused with correctly UTF-8-encoded 4-byte characters.
143+
// This is a temporary solution, we need to find a better way to handle this.
144+
b = ((b & 0x07) << 18) | ((readByte() & 0x3F) << 12) | ((readByte() & 0x3F) << 6) |
145+
(readByte() & 0x3F);
146+
} else if ((b & 0xE0) != 0xE0) {
147+
b = (((b & 0x1F) << 6) | (readByte() & 0x3F));
148+
} else {
149+
b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6);
150+
b |= (readByte() & 0x3F);
151+
}
152+
buffer[i] = b;
153+
}
154+
}
154155

155156
BufferedIndexInput::BufferedIndexInput(int32_t _bufferSize):
156157
buffer(NULL),

src/core/CLucene/store/IndexOutput.cpp

+67-21
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ CL_NS_DEF(store)
158158
writeChars(s, length);
159159
}
160160

161-
template <>
162-
void IndexOutput::writeSChars(const TCHAR* s, const int32_t length){
161+
template <>
162+
void IndexOutput::writeSCharsOrigin(const TCHAR* s, const int32_t length){
163163
if ( length < 0 )
164164
_CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
165165

@@ -179,6 +179,40 @@ CL_NS_DEF(store)
179179
}
180180
}
181181

182+
template <>
183+
void IndexOutput::writeSChars(const TCHAR* s, const int32_t length) {
184+
if (length < 0)
185+
_CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
186+
187+
const int32_t end = length;
188+
for (int32_t i = 0; i < end; ++i) {
189+
auto code = (uint32_t)s[i];
190+
if (code >= 0x00 && code <= 0x7F) {
191+
writeByte((uint8_t)code);
192+
} else if (code <= 0x7FF) {
193+
writeByte((uint8_t)(0xC0 | (code >> 6)));
194+
writeByte((uint8_t)(0x80 | (code & 0x3F)));
195+
} else if (code <= 0xFFFF) {
196+
writeByte((uint8_t)(0xE0 | (code >> 12)));
197+
writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
198+
writeByte((uint8_t)(0x80 | (code & 0x3F)));
199+
} else if (code <= 0x10FFFF) {
200+
// NOTE: This is not correct UTF-8 encoding, but it is what we are doing now.
201+
// We must distinguish it from the previous, incorrect encoding: the old code wrote 4-byte characters as 3-byte sequences starting with 0xF0-0xFF.
202+
// Those sequences would otherwise be confused with correctly UTF-8-encoded 4-byte characters.
203+
// This is a temporary solution, we need to find a better way to handle this.
204+
writeByte((uint8_t)(0x80 | (code >> 18)));
205+
writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
206+
writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
207+
writeByte((uint8_t)(0x80 | (code & 0x3F)));
208+
} else {
209+
writeByte(0xEF);
210+
writeByte(0xBF);
211+
writeByte(0xBD);
212+
}
213+
}
214+
}
215+
182216
template <>
183217
void IndexOutput::writeSChars(const char* s, const int32_t length){
184218
if ( length < 0 )
@@ -187,26 +221,38 @@ CL_NS_DEF(store)
187221
writeBytes((const uint8_t*)s, length);
188222
}
189223

190-
void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
191-
if ( length < 0 )
192-
_CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
193-
194-
const int32_t end = length;
195-
for (int32_t i = 0; i < end; ++i) {
196-
const int32_t code = (int32_t)s[i];
197-
if (code >= 0x01 && code <= 0x7F)
198-
writeByte((uint8_t)code);
199-
else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
200-
writeByte((uint8_t)(0xC0 | (code >> 6)));
201-
writeByte((uint8_t)(0x80 | (code & 0x3F)));
202-
} else {
203-
writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift
204-
writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
205-
writeByte((uint8_t)(0x80 | (code & 0x3F)));
206-
}
207-
}
208-
}
224+
void IndexOutput::writeChars(const TCHAR* s, const int32_t length) {
225+
if (length < 0)
226+
_CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
209227

228+
const int32_t end = length;
229+
for (int32_t i = 0; i < end; ++i) {
230+
auto code = (uint32_t)s[i];
231+
if (code >= 0x00 && code <= 0x7F) {
232+
writeByte((uint8_t)code);
233+
} else if (code <= 0x7FF) {
234+
writeByte((uint8_t)(0xC0 | (code >> 6)));
235+
writeByte((uint8_t)(0x80 | (code & 0x3F)));
236+
} else if (code <= 0xFFFF) {
237+
writeByte((uint8_t)(0xE0 | (code >> 12)));
238+
writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
239+
writeByte((uint8_t)(0x80 | (code & 0x3F)));
240+
} else if (code <= 0x10FFFF) {
241+
// NOTE: This is not correct UTF-8 encoding, but it is what we are doing now.
242+
// We must distinguish it from the previous, incorrect encoding: the old code wrote 4-byte characters as 3-byte sequences starting with 0xF0-0xFF.
243+
// Those sequences would otherwise be confused with correctly UTF-8-encoded 4-byte characters.
244+
// This is a temporary solution, we need to find a better way to handle this.
245+
writeByte((uint8_t)(0x80 | (code >> 18)));
246+
writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
247+
writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
248+
writeByte((uint8_t)(0x80 | (code & 0x3F)));
249+
} else {
250+
writeByte(0xEF);
251+
writeByte(0xBF);
252+
writeByte(0xBD);
253+
}
254+
}
255+
}
210256

211257
int64_t BufferedIndexOutput::getFilePointer() const{
212258
return bufferStart + bufferPosition;

src/core/CLucene/store/IndexOutput.h

+3
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ class CLUCENE_EXPORT IndexOutput:LUCENE_BASE{
8484
template<typename T>
8585
void writeSChars(const T* s, int32_t length);
8686

87+
template<typename T>
88+
void writeSCharsOrigin(const T* s, int32_t length);
89+
8790
/** Closes this stream to further operations. */
8891
virtual void close() = 0;
8992

0 commit comments

Comments
 (0)