@@ -158,8 +158,8 @@ CL_NS_DEF(store)
158
158
writeChars (s, length);
159
159
}
160
160
161
- template <>
162
- void IndexOutput::writeSChars (const TCHAR* s, const int32_t length){
161
+ template <>
162
+ void IndexOutput::writeSCharsOrigin (const TCHAR* s, const int32_t length){
163
163
if ( length < 0 )
164
164
_CLTHROWA (CL_ERR_IllegalArgument, " IO Argument Error. Value must be a positive value." );
165
165
@@ -179,6 +179,40 @@ CL_NS_DEF(store)
179
179
}
180
180
}
181
181
182
+ template <>
183
+ void IndexOutput::writeSChars (const TCHAR* s, const int32_t length) {
184
+ if (length < 0 )
185
+ _CLTHROWA (CL_ERR_IllegalArgument, " IO Argument Error. Value must be a positive value." );
186
+
187
+ const int32_t end = length;
188
+ for (int32_t i = 0 ; i < end; ++i) {
189
+ auto code = (uint32_t )s[i];
190
+ if (code >= 0x00 && code <= 0x7F ) {
191
+ writeByte ((uint8_t )code);
192
+ } else if (code <= 0x7FF ) {
193
+ writeByte ((uint8_t )(0xC0 | (code >> 6 )));
194
+ writeByte ((uint8_t )(0x80 | (code & 0x3F )));
195
+ } else if (code <= 0xFFFF ) {
196
+ writeByte ((uint8_t )(0xE0 | (code >> 12 )));
197
+ writeByte ((uint8_t )(0x80 | ((code >> 6 ) & 0x3F )));
198
+ writeByte ((uint8_t )(0x80 | (code & 0x3F )));
199
+ } else if (code <= 0x10FFFF ) {
200
+ // NOTE: This is not correct UTF-8 encoding, but it is what we are doing now.
201
+ // We must differ it from previous wrong encoding code, previous code will write 3bytes characters starts with 0xF0-0xFF for 4-byte characters.
202
+ // Which will mixed with the correct 4-byte characters with UTF-8 encoding.
203
+ // This is a temporary solution, we need to find a better way to handle this.
204
+ writeByte ((uint8_t )(0x80 | (code >> 18 )));
205
+ writeByte ((uint8_t )(0x80 | ((code >> 12 ) & 0x3F )));
206
+ writeByte ((uint8_t )(0x80 | ((code >> 6 ) & 0x3F )));
207
+ writeByte ((uint8_t )(0x80 | (code & 0x3F )));
208
+ } else {
209
+ writeByte (0xEF );
210
+ writeByte (0xBF );
211
+ writeByte (0xBD );
212
+ }
213
+ }
214
+ }
215
+
182
216
template <>
183
217
void IndexOutput::writeSChars (const char * s, const int32_t length){
184
218
if ( length < 0 )
@@ -187,26 +221,38 @@ CL_NS_DEF(store)
187
221
writeBytes ((const uint8_t *)s, length);
188
222
}
189
223
190
- void IndexOutput::writeChars (const TCHAR* s, const int32_t length){
191
- if ( length < 0 )
192
- _CLTHROWA (CL_ERR_IllegalArgument, " IO Argument Error. Value must be a positive value." );
193
-
194
- const int32_t end = length;
195
- for (int32_t i = 0 ; i < end; ++i) {
196
- const int32_t code = (int32_t )s[i];
197
- if (code >= 0x01 && code <= 0x7F )
198
- writeByte ((uint8_t )code);
199
- else if (((code >= 0x80 ) && (code <= 0x7FF )) || code == 0 ) {
200
- writeByte ((uint8_t )(0xC0 | (code >> 6 )));
201
- writeByte ((uint8_t )(0x80 | (code & 0x3F )));
202
- } else {
203
- writeByte ((uint8_t )(0xE0 | (((uint32_t )code) >> 12 ))); // unsigned shift
204
- writeByte ((uint8_t )(0x80 | ((code >> 6 ) & 0x3F )));
205
- writeByte ((uint8_t )(0x80 | (code & 0x3F )));
206
- }
207
- }
208
- }
224
+ void IndexOutput::writeChars (const TCHAR* s, const int32_t length) {
225
+ if (length < 0 )
226
+ _CLTHROWA (CL_ERR_IllegalArgument, " IO Argument Error. Value must be a positive value." );
209
227
228
+ const int32_t end = length;
229
+ for (int32_t i = 0 ; i < end; ++i) {
230
+ auto code = (uint32_t )s[i];
231
+ if (code >= 0x00 && code <= 0x7F ) {
232
+ writeByte ((uint8_t )code);
233
+ } else if (code <= 0x7FF ) {
234
+ writeByte ((uint8_t )(0xC0 | (code >> 6 )));
235
+ writeByte ((uint8_t )(0x80 | (code & 0x3F )));
236
+ } else if (code <= 0xFFFF ) {
237
+ writeByte ((uint8_t )(0xE0 | (code >> 12 )));
238
+ writeByte ((uint8_t )(0x80 | ((code >> 6 ) & 0x3F )));
239
+ writeByte ((uint8_t )(0x80 | (code & 0x3F )));
240
+ } else if (code <= 0x10FFFF ) {
241
+ // NOTE: This is not correct UTF-8 encoding, but it is what we are doing now.
242
+ // We must differ it from previous wrong encoding code, previous code will write 3bytes characters starts with 0xF0-0xFF for 4-byte characters.
243
+ // Which will mixed with the correct 4-byte characters with UTF-8 encoding.
244
+ // This is a temporary solution, we need to find a better way to handle this.
245
+ writeByte ((uint8_t )(0x80 | (code >> 18 )));
246
+ writeByte ((uint8_t )(0x80 | ((code >> 12 ) & 0x3F )));
247
+ writeByte ((uint8_t )(0x80 | ((code >> 6 ) & 0x3F )));
248
+ writeByte ((uint8_t )(0x80 | (code & 0x3F )));
249
+ } else {
250
+ writeByte (0xEF );
251
+ writeByte (0xBF );
252
+ writeByte (0xBD );
253
+ }
254
+ }
255
+ }
210
256
211
257
int64_t BufferedIndexOutput::getFilePointer () const {
212
258
return bufferStart + bufferPosition;
0 commit comments