Skip to content
This repository has been archived by the owner on Jul 16, 2022. It is now read-only.

Commit

Permalink
Merge pull request #74 from eb4j/topic/miurahr/support-utf16be
Browse files (browse the repository at this point in the history)
Support UTF-16BE dictionary
  • Loading branch information
miurahr authored Mar 2, 2022
2 parents d99858a + b82bd57 commit 78e91e7
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/main/java/io/github/eb4j/dsl/DslDictionaryLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ private static Charset detectCharset(final Path path, final boolean isDictzip) t
} else if (bis.hasBOM(ByteOrderMark.UTF_16LE)) {
charset = StandardCharsets.UTF_16LE;
} else if (bis.hasBOM(ByteOrderMark.UTF_16BE)) {
throw new UnsupportedEncodingException("Unsupported encoding of UTF-16, Big-endian.");
charset = StandardCharsets.UTF_16BE;
} else {
charset = StandardCharsets.UTF_8;
}
Expand Down
17 changes: 13 additions & 4 deletions src/main/java/io/github/eb4j/dsl/impl/EntriesLoaderImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -350,22 +350,31 @@ private long eolSearch() throws IOException {
stream = rais;
}
long current = position();
boolean isUTF16 = StandardCharsets.UTF_16LE.equals(charset);
boolean isBE = StandardCharsets.UTF_16BE.equals(charset);
boolean isUTF16 = StandardCharsets.UTF_16LE.equals(charset) || isBE;
byte prev = 0;
while ((b = stream.read()) != -1) {
if ((byte) b != 0x0a) {
prev = (byte) b;
continue;
}
if (!isUTF16) {
// LF found when UTF-8 and ANSI charsets
return position() - current;
}
// check second byte
if (isBE) {
if ( prev == 0) {
// found LF in UTF-16BE
return position() - current;
} else {
continue;
}
}
// check second byte of Little-endian
if ((b = stream.read()) == -1) {
// eof detected after 0x0a found in UTF-16 case. data seems broken
return -1;
}
if (b != 0x00) {
// it is other than LF, just lower byte is 0x0a
continue;
}
// Found LF in UTF-16LE
Expand Down
18 changes: 10 additions & 8 deletions src/test/java/io/github/eb4j/dsl/DslProprietaryTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,21 +126,23 @@ static boolean apresyanExist() {
private static final String MUELLER = "/mueller/Mueller (En-Ru)_new.dsl.dz";

/**
* Test dsl file which encoding is UTF-16BE, expect throwing UnsupportedEncodingException.
* Test a DSL file whose encoding is UTF-16BE.
* @throws URISyntaxException
* @throws IOException
*/
@Test
@EnabledIf("muellerExist")
void loadDictionaryMueller() throws URISyntaxException, IOException {
Path dictPath = Paths.get(DslProprietaryTest.class.getResource(MUELLER).toURI());
boolean result = false;
try {
DslDictionary.loadDictionary(dictPath, null);
} catch (UnsupportedEncodingException ignored) {
result = true;
}
assertTrue(result);
DslDictionary dictionary = DslDictionary.loadDictionary(dictPath, null);
assertEquals("\u0410\u043D\u0433\u043B\u043E-\u0440\u0443\u0441\u0441\u043A\u0438\u0439 " +
"\u0441\u043B\u043E\u0432\u0430\u0440\u044C \u041C\u044E\u043B\u043B\u0435\u0440\u0430",
dictionary.getDictionaryName());
assertEquals("English", dictionary.getIndexLanguage());
assertEquals("Russian", dictionary.getContentLanguage());
DumpDslVisitor dumper = new DumpDslVisitor();
Map.Entry<String, String> entry = dictionary.lookupPredictive("hello").getEntries(dumper).get(0);
assertEquals("hello", entry.getKey());
}

static boolean muellerExist() {
Expand Down

0 comments on commit 78e91e7

Please sign in to comment.