Skip to content
This repository has been archived by the owner on Jul 16, 2022. It is now read-only.

Commit

Permalink
Support PDIC/Unicode format
Browse files Browse the repository at this point in the history
- Borrowed from monodict (copyright wak), licensed under Apache-2.0

Signed-off-by: Hiroshi Miura <miurahr@linux.com>
  • Loading branch information
miurahr committed Sep 28, 2021
1 parent 00fc209 commit 8271de3
Show file tree
Hide file tree
Showing 10 changed files with 1,322 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Very simple dictionary search application supporting;
- LingvoDSL dz compression (.dsl.dz)
- StarDict (.ifo .dict)
- StarDict (.dict.dz)
- PDIC

![Application image](https://raw.githubusercontent.com/eb4j/ebviewer/main/docs/img/screen_image.png)

Expand Down Expand Up @@ -81,6 +82,8 @@ Copyright (C) 2015-2020 Aaron Madlon-Kay

Copyright (C) 2007-2015 Didier Briel

Copyright (C) 2014 wak

Copyright (C) 2008-2010 Alex Buloichik

Copyright (C) 2007 Zoltan Bartko
Expand Down
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ dependencies {
implementation("commons-io:commons-io:2.11.0")
implementation("org.apache.commons:commons-lang3:3.12.0")
implementation("tokyo.northside:url-protocol-handler:0.1.4")
implementation("com.ibm.icu:icu4j-charset:69.1")

implementation("io.github.dictzip:dictzip:0.9.5")
implementation("com.github.takawitter:trie4j:0.9.8")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public DictionariesManager() {
factories.add(new EPWING());
factories.add(new LingvoDSL());
factories.add(new StarDict());
factories.add(new PDic());
stemmer = new Stemmer();
}

Expand Down
42 changes: 42 additions & 0 deletions src/main/java/io/github/eb4j/ebview/dictionary/PDic.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package io.github.eb4j.ebview.dictionary;

import io.github.eb4j.ebview.data.IDictionary;
import io.github.eb4j.ebview.dictionary.pdic.PdicDictionary;

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * Dictionary factory for PDIC dictionary files, recognised by their
 * {@code .dic} file extension.
 */
public class PDic implements IDictionaryFactory {

    /** File extension (matched case-insensitively) handled by this factory. */
    private static final String EXTENSION = ".dic";

    /**
     * Determine whether or not the supplied file is supported by this factory.
     * This is a lightweight check that only looks at the file extension; the
     * file content is not inspected.
     *
     * @param file The file to check
     * @return {@code true} when the path ends with {@code .dic} in any letter case
     */
    @Override
    public boolean isSupportedFile(File file) {
        final String path = file.getPath();
        // regionMatches returns false (rather than throwing) when the path is
        // shorter than the extension, and ignoreCase covers ".DIC", ".Dic", etc.
        return path.regionMatches(true, path.length() - EXTENSION.length(),
                EXTENSION, 0, EXTENSION.length());
    }

    /**
     * Load the given file and return an {@link IDictionary} that wraps it.
     *
     * @param file The file to load
     * @return A set holding the loaded dictionary, or an empty set when the
     *         file could not be read ({@link IOException}). Unchecked
     *         exceptions raised while constructing the dictionary are not
     *         caught here and propagate to the caller.
     */
    @Override
    public Set<IDictionary> loadDict(File file) {
        Set<IDictionary> result = new HashSet<>();
        try {
            result.add(new PdicDictionary(file));
        } catch (IOException e) {
            // Best effort: an unreadable dictionary is reported and skipped so
            // that the remaining dictionaries can still be loaded.
            e.printStackTrace();
        }
        return result;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package io.github.eb4j.ebview.dictionary.pdic;

import io.github.eb4j.ebview.data.DictionaryEntry;
import io.github.eb4j.ebview.data.IDictionary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * {@link IDictionary} implementation backed by a PDIC dictionary file.
 *
 * <p>The constructor reads and validates the fixed-size file header, then
 * builds a {@link PdicInfo} and loads its index (using a cache file located
 * next to the dictionary, with {@code .idx} appended to the dictionary path).
 */
public class PdicDictionary implements IDictionary {

    static final Logger LOG = LoggerFactory.getLogger(PdicDictionary.class.getName());

    /** Number of bytes read from the start of the file as the header block. */
    private static final int HEADER_SIZE = 256;

    private final File srcFile;
    private final String cachePath;
    private final PdicInfo dicInfo;

    /**
     * Open a PDIC dictionary.
     *
     * @param file the dictionary file
     * @throws IOException      when the file cannot be read, is shorter than
     *                          the header block, or its header fails the
     *                          fixed-field check of {@link PdicHeader#load}
     * @throws RuntimeException when the header describes an unsupported
     *                          dictionary (only Unicode dictionaries of
     *                          version 6 or later are accepted) or when the
     *                          index cannot be loaded
     */
    public PdicDictionary(final File file) throws IOException {
        this.srcFile = file;
        this.cachePath = file.getPath() + ".idx";

        final PdicHeader header = readHeader(file);

        // Only Unicode dictionaries with format version 6 or later are accepted.
        if ((header.version & 0xFF00) < 0x0600 || header.os != 0x20) {
            LOG.warn("Unsupported dictionary version: {}", srcFile.getName());
            throw new RuntimeException("Unsupported dictionary version: " + srcFile.getName());
        }

        dicInfo = new PdicInfo(srcFile, header.header_size + header.extheader,
                header.block_size * header.index_block, header.nindex2, header.index_blkbit,
                header.block_size);
        if (!dicInfo.readIndexBlock(cachePath)) {
            LOG.warn("Failed to load dictionary index of {}", srcFile.getName());
            throw new RuntimeException("Failed to load dictionary index of " + srcFile.getName());
        }
        dicInfo.SetDicName(file.getName());
    }

    /**
     * Read the header block from the start of the file and parse it.
     *
     * @param file the dictionary file
     * @return the parsed header
     * @throws IOException when the file is too short or the header is rejected
     */
    private static PdicHeader readHeader(final File file) throws IOException {
        final ByteBuffer buffer = ByteBuffer.allocate(HEADER_SIZE);
        try (FileInputStream stream = new FileInputStream(file);
             FileChannel channel = stream.getChannel()) {
            // A single read is not guaranteed to fill the buffer, so keep
            // reading until it is full or end-of-file is reached.
            while (buffer.hasRemaining()) {
                if (channel.read(buffer) < 0) {
                    break;
                }
            }
        }
        if (buffer.hasRemaining()) {
            throw new IOException("Truncated PDIC header: " + file.getName());
        }

        final PdicHeader header = new PdicHeader();
        // load() flips the buffer itself and returns 0 when its fixed-field
        // check fails.
        if (header.load(buffer) == 0) {
            throw new IOException("Unrecognized PDIC header: " + file.getName());
        }
        return header;
    }

    /**
     * @return the dictionary name, which is the name of the source file
     */
    @Override
    public String getDictionaryName() {
        return dicInfo.GetDicName();
    }

    /**
     * Read article's text.
     *
     * <p>The word is lower-cased (locale-independently) before it is looked up.
     * For each hit the article text is built as
     * {@code [pronunciation " / "] translation "<br/>" [sample]}.
     *
     * @param word The word to look up in the dictionary
     * @return List of entries. May be empty, but cannot be null.
     */
    @Override
    public List<DictionaryEntry> readArticles(String word) {
        final List<DictionaryEntry> lists = new ArrayList<>();
        // Locale.ROOT keeps the lookup key stable regardless of the default
        // locale (e.g. Turkish dotless-i rules).
        if (!dicInfo.searchWord(word.toLowerCase(Locale.ROOT))) {
            return lists;
        }

        final PdicResult result = dicInfo.getResult();
        for (int i = 0; i < result.getCount(); i++) {
            String disp = result.getDisp(i);
            // Fall back to the index word when no display form is available.
            if (disp == null || disp.isEmpty()) {
                disp = result.getIndex(i);
            }

            final StringBuilder sb = new StringBuilder();
            final String phone = result.getPhone(i);
            if (phone != null) {
                sb.append(phone).append(" / ");
            }
            sb.append(result.getTrans(i)).append("<br/>");
            final String sample = result.getSample(i);
            if (sample != null) {
                sb.append(sample);
            }
            lists.add(new DictionaryEntry(disp, sb.toString(), getDictionaryName()));
        }
        return lists;
    }

    /**
     * Read article's text for a predictive lookup.
     *
     * <p>This implementation simply delegates to {@link #readArticles(String)};
     * NOTE(review): whether the result is actually predictive (prefix)
     * matching depends on {@code PdicInfo.searchWord} - confirm.
     *
     * @param word The word to look up in the dictionary
     * @return List of entries. May be empty, but cannot be null.
     */
    @Override
    public List<DictionaryEntry> readArticlesPredictive(String word) {
        return readArticles(word);
    }

    /**
     * Dispose IDictionary. No action is taken here.
     */
    @Override
    public void close() throws IOException {
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/**
* Copyright (C) 2014 wak (Apache-2.0)
*/
package io.github.eb4j.ebview.dictionary.pdic;

/**
 * Plain mutable holder for the parts of one dictionary entry.
 *
 * <p>All fields start at their Java default values ({@code 0} / {@code null}).
 * NOTE(review): the field meanings below are taken from their names only -
 * confirm against the code that fills them in.
 */
final class PdicElement {
    /** Attribute byte of the entry. */
    public byte mAttr;
    /** Index (lookup) form of the word. */
    public String mIndex;
    /** Display form of the word. */
    public String mDisp;
    /** Translation text. */
    public String mTrans;
    /** Sample / example text. */
    public String mSample;
    /** Pronunciation text. */
    public String mPhone;

    /** Creates an element with every field at its default value. */
    public PdicElement() {
    }
}

156 changes: 156 additions & 0 deletions src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicHeader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/**
* Copyright (C) 2014 wak (Apache-2.0)
*/
package io.github.eb4j.ebview.dictionary.pdic;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.Charset;

/**
 * In-memory representation of the fixed header of a PDIC dictionary file.
 *
 * <p>{@link #load(ByteBuffer)} fills the public fields from a raw header block.
 * {@code update_count} and {@code dicident} are declared but never assigned by
 * {@code load}.
 */
final class PdicHeader {
    /** Length in bytes of the header-name string at the start of the header. */
    private static final int L_HEADERNAME = 100;
    /** Length in bytes of the dictionary-title string. */
    private static final int L_DICTITLE = 40;

    public String headername;    // dictionary header title
    public String dictitle;      // dictionary name
    public short version;        // dictionary version
    public short lword;          // maximum length of a headword
    public short ljapa;          // maximum length of a translation
    public short block_size;     // bytes per block (256), fixed
    public short index_block;    // number of index blocks
    public short header_size;    // header size in bytes
    public short index_size;     // index size in bytes (unused)

    public short empty_block;    // first physical block number of the free blocks (-1 if none)
    public short nindex;         // number of index elements (unused)
    public short nblock;         // number of data blocks in use (unused)
    public int nword;            // number of registered words

    public byte dicorder;        // dictionary order
    public byte dictype;         // dictionary type

    public byte attrlen;         // length of the word attribute
    public byte os;              // OS
    public int olenumber;        // serial number for OLE
    public short lid_word;       // language ID of the headword

    public short lid_japa;       // language ID of the translation part
    public short lid_exp;        // language ID of the example part
    public short lid_pron;       // language ID of the pronunciation symbols
    public short lid_other;      // language ID of other parts
    public boolean index_blkbit; // false: 16bit, true: 32bit
    public int extheader;        // extended header size
    public int empty_block2;     // first physical block number of the free blocks
    public int nindex2;          // number of index elements
    public int nblock2;          // number of data blocks in use

    public int update_count;     // number of dictionary updates
    public String dicident;      // dictionary identifier

    /**
     * Constructor.
     */
    public PdicHeader() {
    }

    /**
     * Parse a raw header block into this object's fields.
     *
     * <p>The buffer is flipped and switched to little-endian byte order by this
     * method, so the caller passes it as it was left after being filled.
     * The two leading strings are decoded with the {@code X-SJIS} charset over
     * their full fixed length, including any padding bytes.
     *
     * @param header_block header data block
     * @return the major dictionary version ({@code version >> 8}) when the
     *         fixed-field check passes, otherwise {@code 0}
     * @throws RuntimeException when the major version is not 4, 5 or 6
     */
    public int load(ByteBuffer header_block) throws RuntimeException {
        final Charset sjis = Charset.forName("X-SJIS");

        header_block.flip();
        header_block.order(ByteOrder.LITTLE_ENDIAN);

        headername = readText(header_block, L_HEADERNAME, sjis);
        dictitle = readText(header_block, L_DICTITLE, sjis);
        version = header_block.getShort();

        switch (version & 0xFF00) {
            case 0x0500:
            case 0x0600:
                readLeadingFields(header_block);
                os = header_block.get();
                olenumber = header_block.getInt();
                readLanguageIds(header_block);
                index_blkbit = header_block.get() != 0;
                header_block.get(); // dummy0
                readExtendedCounters(header_block);
                // fixed-field check
                return attrlen == 1 ? version >> 8 : 0;

            case 0x0400:
                readLeadingFields(header_block);
                olenumber = header_block.getInt();
                os = header_block.get();
                readLanguageIds(header_block);
                readExtendedCounters(header_block);
                index_blkbit = header_block.get() != 0;
                // fixed-field check
                if (block_size != 0x100 || header_size != 0x100 || attrlen != 1) {
                    return 0;
                }
                return version >> 8;

            default:
                throw new RuntimeException("Unsupported format");
        }
    }

    /** Reads {@code length} bytes from the buffer and decodes them with {@code charset}. */
    private static String readText(ByteBuffer source, int length, Charset charset) {
        final byte[] raw = new byte[length];
        source.get(raw);
        return charset.decode(ByteBuffer.wrap(raw)).toString();
    }

    /** Reads the fields from {@code lword} through {@code attrlen}, laid out identically in every supported version. */
    private void readLeadingFields(ByteBuffer source) {
        lword = source.getShort();
        ljapa = source.getShort();

        block_size = source.getShort();
        index_block = source.getShort();
        header_size = source.getShort();
        index_size = source.getShort();
        empty_block = source.getShort();
        nindex = source.getShort();
        nblock = source.getShort();

        nword = source.getInt();

        dicorder = source.get();
        dictype = source.get();
        attrlen = source.get();
    }

    /** Reads the five consecutive language-ID fields. */
    private void readLanguageIds(ByteBuffer source) {
        lid_word = source.getShort();
        lid_japa = source.getShort();
        lid_exp = source.getShort();
        lid_pron = source.getShort();
        lid_other = source.getShort();
    }

    /** Reads the four consecutive 32-bit fields {@code extheader} through {@code nblock2}. */
    private void readExtendedCounters(ByteBuffer source) {
        extheader = source.getInt();
        empty_block2 = source.getInt();
        nindex2 = source.getInt();
        nblock2 = source.getInt();
    }

}
Loading

0 comments on commit 8271de3

Please sign in to comment.