Skip to content
This repository has been archived by the owner on Jul 16, 2022. It is now read-only.

Commit

Permalink
Merge pull request #28 from eb4j/topic/miurahr/protobuf-index-save
Browse files Browse the repository at this point in the history
Save index with protocol buffers
  • Loading branch information
miurahr authored Jan 30, 2022
2 parents 35591a7 + 01e8d75 commit 4553c4e
Show file tree
Hide file tree
Showing 10 changed files with 302 additions and 49 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ build
.gradle
out
version.properties
generated
19 changes: 17 additions & 2 deletions build.gradle.kts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import java.io.File
import java.io.FileInputStream
import java.util.Properties
import com.google.protobuf.gradle.protobuf
import com.google.protobuf.gradle.protoc

plugins {
checkstyle
Expand All @@ -10,6 +12,7 @@ plugins {
`java-library-distribution`
`maven-publish`
id("com.intershop.gradle.javacc") version "4.0.1"
id("com.google.protobuf") version "0.8.18"
id("com.github.spotbugs") version "5.0.5"
id("com.diffplug.spotless") version "6.2.0"
id("com.github.kt3k.coveralls") version "2.12.0"
Expand All @@ -23,11 +26,14 @@ repositories {
mavenCentral()
}

// Single protobuf version used for both the protobuf-java runtime dependency and the protoc compiler artifact.
val protobufVersion = "3.6.1"

dependencies {
implementation("org.jetbrains:annotations:23.0.0")
implementation("com.github.takawitter:trie4j:0.9.8")
implementation("commons-io:commons-io:2.11.0")
implementation("io.github.dictzip:dictzip:0.11.1")
implementation("com.google.protobuf:protobuf-java:$protobufVersion")
testImplementation("org.codehaus.groovy:groovy-all:3.0.9")
testImplementation("org.junit.jupiter:junit-jupiter-api:5.8.2")
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.8.2")
Expand Down Expand Up @@ -90,18 +96,27 @@ tasks.withType<Javadoc> {
exclude("io/github/eb4j/dsl/DslParser*",
"io/github/eb4j/dsl/Token*",
"io/github/eb4j/dsl/JavaCharStream.java",
"io/github/eb4j/dsl/ParseException.java")
"io/github/eb4j/dsl/ParseException.java",
"io/github/eb4j/dsl/DslIndexOuterClass.java"
)
}

javacc {
javaCCVersion = "7.0.10"
configs.create("dsl") {
inputFile = File("src/main/java/io/github/eb4j/dsl/DslParser.jj")
outputDir = File("build/generated/javacc")
outputDir = File("src/generated/main/java")
packageName = "io.github.eb4j.dsl"
}
}

protobuf {
    // Base directory for generated sources. This is a property of the protobuf extension itself,
    // not of the protoc locator, so it is set at this level rather than inside protoc { }.
    generatedFilesBaseDir = File("src/generated").toString()
    protoc {
        // Resolve the protoc compiler from the repository, pinned to the runtime library version.
        artifact = "com.google.protobuf:protoc:$protobufVersion"
    }
}

// we handle cases without .git directory
val home = System.getProperty("user.home")
val javaHome = System.getProperty("java.home")
Expand Down
1 change: 1 addition & 0 deletions config/checkstyle/suppressions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
<suppress checks=".*" files="Parse.*.java"/>
<suppress checks=".*" files="JavaChar.*.java"/>
<suppress checks="ModifierOrder" files="DslArticle.java"/>
<suppress checks=".*" files="DslIndexOuterClass.java"/>
</suppressions>
11 changes: 11 additions & 0 deletions config/spotbugs/exclude.xml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,15 @@
<Bug code="EI2"/>
</Or>
</Match>
<Match>
    <!-- Nested classes are matched by their binary name (Outer$Inner), so a regular expression
         is used to cover DslIndexOuterClass together with every class nested inside it. -->
    <Class name="~io\.github\.eb4j\.dsl\.DslIndexOuterClass(\$.*)?"/>
    <Or>
        <Bug code="EI"/>
        <Bug code="EI2"/>
        <Bug code="MS"/>
    </Or>
</Match>
</FindBugsFilter>
Empty file.
13 changes: 12 additions & 1 deletion src/main/java/io/github/eb4j/dsl/DslDictionary.java
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,17 @@ public static DslDictionary loadDictionary(@NotNull final File file) throws IOEx
* @throws IOException raise when I/O error occurred
*/
public static DslDictionary loadDictionary(@NotNull final Path path) throws IOException {
return DslDictionaryLoader.load(path);
return DslDictionaryLoader.load(path, null);
}

/**
* Loader entry point that additionally takes the location of a dictionary index file.
* <p>
* Both arguments are handed unchanged to {@code DslDictionaryLoader.load(Path, Path)}.
* @param path dictionary file.
* @param index dictionary index file; may be {@code null}, in which case no index file is read or written.
* @return DslDictionary object.
* @throws IOException raise when I/O error occurred
*/
public static DslDictionary loadDictionary(@NotNull final Path path, final Path index) throws IOException {
return DslDictionaryLoader.load(path, index);
}
}
139 changes: 119 additions & 20 deletions src/main/java/io/github/eb4j/dsl/DslDictionaryLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package io.github.eb4j.dsl;

import com.google.protobuf.ByteString;
import io.github.eb4j.dsl.data.DictionaryData;
import io.github.eb4j.dsl.data.DictionaryDataBuilder;
import io.github.eb4j.dsl.data.DslDictionaryProperty;
Expand All @@ -27,17 +28,23 @@
import org.dict.zip.DictZipInputStream;
import org.dict.zip.RandomAccessInputStream;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -58,8 +65,7 @@ final class DslDictionaryLoader {
// Private constructor: this final loader class is not meant to be instantiated.
private DslDictionaryLoader() {
}

@SuppressWarnings("AvoidInlineConditionals")
static DslDictionary load(@NotNull final Path path) throws IOException {
static DslDictionary load(@NotNull final Path path, @Nullable final Path indexPath) throws IOException {
// check path
if (!path.toFile().isFile()) {
throw new IOException("Target file is not a file.");
Expand All @@ -69,17 +75,47 @@ static DslDictionary load(@NotNull final Path path) throws IOException {
throw new IOException("Error reading target file.");
}
boolean isDictzip = filename.toString().endsWith(".dz");
Charset charset = detectCharset(path, isDictzip);
byte[] eol = detectEol(path, isDictzip, charset);
Map<String, String> metadata = readMetadata(path, isDictzip, charset);
DslDictionaryProperty prop = new DslDictionaryProperty(
metadata.get("name"), metadata.get("index"), metadata.get("content"), charset, eol);
List<DslIndexOuterClass.DslIndex.Entry> entries = null;
DslDictionaryProperty prop = null;
DslIndexOuterClass.DslIndex index = getIndexFromFileAndValidate(path, indexPath);
if (index != null) {
prop = getPropertyFromIndex(index);
entries = index.getEntriesList();
}
// When there is no index or failed to validate
if (entries == null) {
Charset charset = detectCharset(path, isDictzip);
byte[] eol = detectEol(path, isDictzip, charset);
Map<String, String> metadata = readMetadata(path, isDictzip, charset);
prop = new DslDictionaryProperty(
metadata.get("name"), metadata.get("index"), metadata.get("content"), charset, eol);
entries = loadEntriesFromDslFile(path, isDictzip, prop);
buildIndexFile(path, indexPath, entries, prop);
}
DictionaryData<DslEntry> data = new DictionaryDataBuilder<DslEntry>().build(entries);
if (isDictzip) {
return new DslZipDictionary(path, data, prop);
} else {
return new DslFileDictionary(path, data, prop);
}
}

/**
 * Load a dictionary without an index file.
 * <p>
 * Convenience overload that delegates to {@link #load(Path, Path)} with a {@code null} index path.
 *
 * @param path dictionary file.
 * @return DslDictionary object.
 * @throws IOException raise when I/O error occurred
 */
static DslDictionary load(@NotNull final Path path) throws IOException {
    return load(path, null);
}

@SuppressWarnings("AvoidInlineConditionals")
private static List<DslIndexOuterClass.DslIndex.Entry> loadEntriesFromDslFile(
final Path path, final boolean isDictzip, final DslDictionaryProperty prop) throws IOException {
// prepare creation of index
byte[] eol = prop.getEol();
Charset charset = prop.getCharset();
byte[] delimiter = Arrays.copyOf(eol, eol.length * 2);
System.arraycopy(eol, 0, delimiter, eol.length, eol.length);
StreamSearcher eolSearcher = new StreamSearcher(eol);
StreamSearcher cardEndSearcher = new StreamSearcher(delimiter);
DictionaryDataBuilder<DslEntry> builder = new DictionaryDataBuilder<>();
List<DslIndexOuterClass.DslIndex.Entry> entries = new ArrayList<>();
// build dictionary index
try (InputStream is = isDictzip ? new DictZipInputStream(
new RandomAccessInputStream(new RandomAccessFile(path.toFile(), "r"))) :
Expand Down Expand Up @@ -112,29 +148,89 @@ static DslDictionary load(@NotNull final Path path) throws IOException {
is.reset();
// last article
String[] tokens = headWords.split("\\r?\\n");
for (String token: tokens) {
builder.add(token, new DslEntry(articleStart, (is.available())));
for (String token : tokens) {
entries.add(DslIndexOuterClass.DslIndex.Entry.newBuilder()
.setHeadWord(token)
.setOffset(articleStart)
.setSize(is.available())
.build());
}
break;
}
String[] tokens = headWords.split("\\r?\\n");
for (String token: tokens) {
builder.add(token, new DslEntry(articleStart, (int) pos - eol.length));
for (String token : tokens) {
entries.add(DslIndexOuterClass.DslIndex.Entry.newBuilder()
.setHeadWord(token)
.setOffset(articleStart)
.setSize((int) pos - eol.length)
.build());
}
// increment to next card start
cardStart = articleStart + pos;
}
}
DictionaryData<DslEntry> data = builder.build();
if (isDictzip) {
return new DslZipDictionary(path, data, prop);
} else {
return new DslFileDictionary(path, data, prop);
return entries;
}

/**
 * Read a previously saved index and accept it only when it passes validation
 * against the dictionary file.
 *
 * @param path dictionary file the index has to describe.
 * @param indexPath location of the index file, may be {@code null}.
 * @return the parsed index, or {@code null} when no index path is given, the index file
 *         cannot be read, an I/O error occurs, or validation fails.
 */
private static DslIndexOuterClass.DslIndex getIndexFromFileAndValidate(final Path path, final Path indexPath) {
    if (indexPath == null || !indexPath.toFile().canRead()) {
        return null;
    }
    try (InputStream indexStream = Files.newInputStream(indexPath)) {
        DslIndexOuterClass.DslIndex candidate = DslIndexOuterClass.DslIndex.parseFrom(indexStream);
        if (!validateIndex(path, candidate)) {
            return null;
        }
        return candidate;
    } catch (IOException ignored) {
        // An index that cannot be read is handled like a missing one.
        return null;
    }
}

/**
 * Check whether a stored index still describes the given dictionary file.
 * <p>
 * The index is accepted only when the recorded file name, file size and
 * last-modified time all equal the current values of {@code path}.
 *
 * @param path dictionary file.
 * @param index index read from the index file.
 * @return {@code true} when all three recorded attributes match.
 * @throws IOException when file attributes cannot be read.
 */
private static boolean validateIndex(final Path path, final DslIndexOuterClass.DslIndex index)
        throws IOException {
    final long actualModified = Files.getLastModifiedTime(path).toMillis();
    final long recordedModified = index.getFileLastModifiedTime();
    if (!path.toString().equals(index.getFilename())) {
        return false;
    }
    if (Files.size(path) != index.getFilesize()) {
        return false;
    }
    return actualModified == recordedModified;
}

/**
 * Rebuild the dictionary property object from the values stored in an index.
 *
 * @param index index read from the index file.
 * @return property holding dictionary name, index language, content language, charset and EOL bytes.
 */
private static DslDictionaryProperty getPropertyFromIndex(final DslIndexOuterClass.DslIndex index) {
    final String dictionaryName = index.getDictionaryName();
    final String indexLanguage = index.getIndexLanguage();
    final String contentLanguage = index.getContentLanguage();
    // The charset is stored by name and resolved back to a Charset here.
    final Charset storedCharset = Charset.forName(index.getCharset());
    final byte[] eolBytes = index.getEol().toByteArray();
    return new DslDictionaryProperty(dictionaryName, indexLanguage, contentLanguage, storedCharset, eolBytes);
}

/**
 * Store the dictionary index, together with the attributes of the dictionary file
 * and the dictionary properties, in the index file.
 * <p>
 * Nothing is written when {@code indexPath} is {@code null}. An {@link IOException}
 * raised while writing is ignored.
 *
 * @param path dictionary file the index belongs to.
 * @param indexPath target index file, may be {@code null}.
 * @param entries index entries to store.
 * @param prop dictionary properties to store.
 * @throws IOException declared for callers; failures inside the write are caught here.
 */
private static void buildIndexFile(@NotNull final Path path, @Nullable final Path indexPath,
                                   @NotNull final List<DslIndexOuterClass.DslIndex.Entry> entries,
                                   @NotNull final DslDictionaryProperty prop) throws IOException {
    if (indexPath != null) {
        try (OutputStream out = Files.newOutputStream(indexPath, StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)) {
            // File name, size and modification time are recorded for later validation of the index.
            final String dictionaryFile = path.toString();
            final long dictionarySize = Files.size(path);
            final long dictionaryModified = Files.getLastModifiedTime(path).toMillis();
            DslIndexOuterClass.DslIndex message = DslIndexOuterClass.DslIndex.newBuilder()
                    .setFilename(dictionaryFile)
                    .setFilesize(dictionarySize)
                    .setFileLastModifiedTime(dictionaryModified)
                    .setDictionaryName(prop.getDictionaryName())
                    .setIndexLanguage(prop.getIndexLanguage())
                    .setContentLanguage(prop.getContentLanguage())
                    .setCharset(prop.getCharset().name())
                    .setEol(ByteString.copyFrom(prop.getEol()))
                    .addAllEntries(entries)
                    .build();
            message.writeTo(out);
            out.flush();
        } catch (IOException ignored) {
            // Saving the index is best effort.
        }
    }
}

@SuppressWarnings("AvoidInlineConditionals")
private static byte[] detectEol(final Path path, final boolean isDictzip, final Charset charset) throws IOException {
private static byte[] detectEol(final Path path, final boolean isDictzip, final Charset charset)
throws IOException {
byte[] eol;
try (InputStream bis = isDictzip ? new DictZipInputStream(
new RandomAccessInputStream(new RandomAccessFile(path.toFile(), "r"))) :
Expand All @@ -159,6 +255,7 @@ private static byte[] detectEol(final Path path, final boolean isDictzip, final
return eol;
}

@SuppressWarnings("AvoidInlineConditionals")
private static Charset detectCharset(final Path path, final boolean isDictzip) throws IOException {
Map<String, String> metadata;
Charset charset;
Expand All @@ -172,7 +269,7 @@ private static Charset detectCharset(final Path path, final boolean isDictzip) t
byte[] buf = new byte[4];
if (bis.read(buf, 0, 4) == -1) {
throw new IOException("Unexpected end of file.");
};
}
if (buf[1] == '\0') {
charset = StandardCharsets.UTF_16LE;
} else {
Expand Down Expand Up @@ -205,7 +302,9 @@ private static Charset detectCharset(final Path path, final boolean isDictzip) t
return charset;
}

private static Map<String, String> readMetadata(final Path path, final boolean isDictzip, Charset charset) throws IOException {
@SuppressWarnings("AvoidInlineConditionals")
private static Map<String, String> readMetadata(final Path path, final boolean isDictzip, final Charset charset)
throws IOException {
final Map<String, String> metadata = new HashMap<>();
try (InputStream bis = isDictzip ? new DictZipInputStream(
new RandomAccessInputStream(new RandomAccessFile(path.toFile(), "r"))) :
Expand Down
Loading

0 comments on commit 4553c4e

Please sign in to comment.