diff --git a/src/main/java/io/github/eb4j/dsl/DslDictionaryLoader.java b/src/main/java/io/github/eb4j/dsl/DslDictionaryLoader.java index 8a7c3de..74daf03 100644 --- a/src/main/java/io/github/eb4j/dsl/DslDictionaryLoader.java +++ b/src/main/java/io/github/eb4j/dsl/DslDictionaryLoader.java @@ -246,7 +246,7 @@ private static Charset detectCharset(final Path path, final boolean isDictzip) t } else if (bis.hasBOM(ByteOrderMark.UTF_16LE)) { charset = StandardCharsets.UTF_16LE; } else if (bis.hasBOM(ByteOrderMark.UTF_16BE)) { - throw new UnsupportedEncodingException("Unsupported encoding of UTF-16, Big-endian."); + charset = StandardCharsets.UTF_16BE; } else { charset = StandardCharsets.UTF_8; } diff --git a/src/main/java/io/github/eb4j/dsl/impl/EntriesLoaderImpl.java b/src/main/java/io/github/eb4j/dsl/impl/EntriesLoaderImpl.java index e0a7c4f..a33794a 100644 --- a/src/main/java/io/github/eb4j/dsl/impl/EntriesLoaderImpl.java +++ b/src/main/java/io/github/eb4j/dsl/impl/EntriesLoaderImpl.java @@ -350,22 +350,31 @@ private long eolSearch() throws IOException { stream = rais; } long current = position(); - boolean isUTF16 = StandardCharsets.UTF_16LE.equals(charset); + boolean isBE = StandardCharsets.UTF_16BE.equals(charset); + boolean isUTF16 = StandardCharsets.UTF_16LE.equals(charset) || isBE; + byte prev = 0; while ((b = stream.read()) != -1) { if ((byte) b != 0x0a) { + prev = (byte) b; continue; } if (!isUTF16) { // LF found when UTF-8 and ANSI charsets return position() - current; } - // check second byte + if (isBE) { + if ( prev == 0) { + // found LF in UTF-16BE + return position() - current; + } else { + continue; + } + } + // check second byte of Little-endian if ((b = stream.read()) == -1) { - // eof detected after 0x0a found in UTF-16 case. data seems broken return -1; } if (b != 0x00) { - // it is other than LF, just lower byte is 0x0a continue; } // Found LF in UTF-16LE diff --git a/src/test/java/io/github/eb4j/dsl/DslProprietaryTest.java b/src/test/java/io/github/eb4j/dsl/DslProprietaryTest.java index b43a3a7..abed7a3 100644 --- a/src/test/java/io/github/eb4j/dsl/DslProprietaryTest.java +++ b/src/test/java/io/github/eb4j/dsl/DslProprietaryTest.java @@ -126,7 +126,7 @@ static boolean apresyanExist() { private static final String MUELLER = "/mueller/Mueller (En-Ru)_new.dsl.dz"; /** - * Test dsl file which encoding is UTF-16BE, expect throwing UnsupportedEncodingException. + * Test dsl file which encoding is UTF-16BE. * @throws URISyntaxException * @throws IOException */ @@ -134,13 +134,15 @@ static boolean apresyanExist() { @EnabledIf("muellerExist") void loadDictionaryMueller() throws URISyntaxException, IOException { Path dictPath = Paths.get(DslProprietaryTest.class.getResource(MUELLER).toURI()); - boolean result = false; - try { - DslDictionary.loadDictionary(dictPath, null); - } catch (UnsupportedEncodingException ignored) { - result = true; - } - assertTrue(result); + DslDictionary dictionary = DslDictionary.loadDictionary(dictPath, null); + assertEquals("\u0410\u043D\u0433\u043B\u043E-\u0440\u0443\u0441\u0441\u043A\u0438\u0439 " + + "\u0441\u043B\u043E\u0432\u0430\u0440\u044C \u041C\u044E\u043B\u043B\u0435\u0440\u0430", + dictionary.getDictionaryName()); + assertEquals("English", dictionary.getIndexLanguage()); + assertEquals("Russian", dictionary.getContentLanguage()); + DumpDslVisitor dumper = new DumpDslVisitor(); + Map.Entry entry = dictionary.lookupPredictive("hello").getEntries(dumper).get(0); + assertEquals("hello", entry.getKey()); } static boolean muellerExist() {