This repository has been archived by the owner on Jul 16, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from eb4j/topic-support-pdic
Support PDIC/Unicode format
- Loading branch information
Showing
10 changed files
with
1,294 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
package io.github.eb4j.ebview.dictionary; | ||
|
||
import io.github.eb4j.ebview.data.IDictionary; | ||
import io.github.eb4j.ebview.dictionary.pdic.PdicDictionary; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
/** | ||
* @author Hiroshi Miura | ||
*/ | ||
public class PDic implements IDictionaryFactory { | ||
/** | ||
* Determine whether or not the supplied file is supported by this factory. | ||
* This is intended to be a lightweight check, e.g. looking for a file | ||
* extension. | ||
* | ||
* @param file The file to check | ||
* @return Whether or not the file is supported | ||
*/ | ||
@Override | ||
public boolean isSupportedFile(final File file) { | ||
return file.getPath().endsWith(".DIC") || file.getPath().endsWith(".dic"); | ||
} | ||
|
||
/** | ||
* Load the given file and return an {@link IDictionary} that wraps it. | ||
* | ||
* @param file The file to load | ||
* @return An IDictionary file that can read articles from the file | ||
*/ | ||
@Override | ||
public Set<IDictionary> loadDict(final File file) { | ||
Set<IDictionary> result = new HashSet<>(); | ||
try { | ||
IDictionary dictionary = new PdicDictionary(file); | ||
result.add(dictionary); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} | ||
return result; | ||
} | ||
} |
115 changes: 115 additions & 0 deletions
115
src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicDictionary.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
package io.github.eb4j.ebview.dictionary.pdic; | ||
|
||
import io.github.eb4j.ebview.data.DictionaryEntry; | ||
import io.github.eb4j.ebview.data.IDictionary; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.nio.ByteBuffer; | ||
import java.nio.channels.FileChannel; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* @author wak (Apache-2.0) | ||
* @author Hiroshi Miura | ||
*/ | ||
public class PdicDictionary implements IDictionary { | ||
|
||
static final Logger LOG = LoggerFactory.getLogger(PdicDictionary.class.getName()); | ||
|
||
private final File srcFile; | ||
private final String cachePath; | ||
private PdicInfo dicInfo; | ||
|
||
public PdicDictionary(final File file) throws IOException { | ||
this.srcFile = file; | ||
cachePath = file.getPath() + ".idx"; | ||
final int headerSize = 256; | ||
PdicHeader header; // ヘッダー | ||
|
||
ByteBuffer headerbuff = ByteBuffer.allocate(headerSize); | ||
try (FileInputStream srcStream = new FileInputStream(srcFile); | ||
FileChannel srcChannel = srcStream.getChannel()) { | ||
int len = srcChannel.read(headerbuff); | ||
srcChannel.close(); | ||
if (len == headerSize) { | ||
header = new PdicHeader(); | ||
if (header.load(headerbuff) != 0) { | ||
// Unicode辞書 かつ ver6以上のみ許容 | ||
if ((header.version & 0xFF00) < 0x0600 || header.os != 0x20) { | ||
LOG.warn("Unsupported dictionary version" + srcFile.getName()); | ||
throw new RuntimeException(); | ||
} | ||
dicInfo = new PdicInfo(srcFile, header.header_size + header.extheader, | ||
header.block_size * header.index_block, header.nindex2, header.index_blkbit, | ||
header.block_size); | ||
if (!dicInfo.readIndexBlock(cachePath)) { | ||
LOG.warn("Failed to load dictionary index of " + srcFile.getName()); | ||
throw new RuntimeException(); | ||
} | ||
dicInfo.setDicName(file.getName()); | ||
} | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public String getDictionaryName() { | ||
return dicInfo.getDicName(); | ||
} | ||
|
||
/** | ||
* Read article's text. | ||
* | ||
* @param word The word to look up in the dictionary | ||
* @return List of entries. May be empty, but cannot be null. | ||
*/ | ||
@Override | ||
public List<DictionaryEntry> readArticles(final String word) { | ||
List<DictionaryEntry> lists = new ArrayList<>(); | ||
if (dicInfo.searchWord(word.toLowerCase())) { | ||
PdicResult result = dicInfo.getResult(); | ||
for (int i = 0; i < result.getCount(); i++) { | ||
String disp = result.getDisp(i); | ||
if (disp.equals("")) { | ||
disp = result.getIndex(i); | ||
} | ||
StringBuilder sb = new StringBuilder(); | ||
String phone = result.getPhone(i); | ||
if (phone != null) { | ||
sb.append(phone).append(" / "); | ||
} | ||
sb.append(result.getTrans(i)).append("<br/>"); | ||
String sample = result.getSample(i); | ||
if (sample != null) { | ||
sb.append(sample); | ||
} | ||
lists.add(new DictionaryEntry(disp, sb.toString(), getDictionaryName())); | ||
} | ||
} | ||
return lists; | ||
} | ||
|
||
/** | ||
* Read article's text. Matching is predictive, so e.g. supplying "term" | ||
* will return articles for "term", "terminology", "termite", etc. | ||
* | ||
* @param word The word to look up in the dictionary | ||
* @return List of entries. May be empty, but cannot be null. | ||
*/ | ||
@Override | ||
public List<DictionaryEntry> readArticlesPredictive(final String word) { | ||
return readArticles(word); | ||
} | ||
|
||
/** | ||
* Dispose IDictionary. Default is no action. | ||
*/ | ||
@Override | ||
public void close() { | ||
} | ||
} |
20 changes: 20 additions & 0 deletions
20
src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicElement.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package io.github.eb4j.ebview.dictionary.pdic; | ||
|
||
/**
 * Mutable holder for the fields of one dictionary element.
 *
 * <p>All fields are public and start out at their Java defaults
 * ({@code 0} for {@link #mAttr}, {@code null} for every string).
 *
 * @author wak (Apache-2.0)
 * @author Hiroshi Miura
 */
@SuppressWarnings("visibilitymodifier")
final class PdicElement {
    /** Attribute byte. */
    public byte mAttr;
    /** Index text. */
    public String mIndex;
    /** Display text. */
    public String mDisp;
    /** Translation text. */
    public String mTrans;
    /** Sample text. */
    public String mSample;
    /** Phonetic text (NOTE(review): inferred from the field name; confirm). */
    public String mPhone;
}
|
111 changes: 111 additions & 0 deletions
111
src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicHeader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
package io.github.eb4j.ebview.dictionary.pdic; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.nio.ByteOrder; | ||
|
||
/**
 * Parsed subset of the fixed-layout header of a PDIC dictionary file.
 *
 * <p>Only the fields kept below are stored; every other header field is read
 * and discarded by {@link #load(ByteBuffer)} purely to advance the buffer.
 *
 * @author wak (Apache-2.0)
 * @author Hiroshi Miura
 */
@SuppressWarnings({"visibilitymodifier", "membername"})
final class PdicHeader {
    private static final int L_HEADERNAME = 100; // length of the header-name string
    private static final int L_DICTITLE = 40;    // length of the dictionary title

    /** Number of 16-bit "lid_dummy" padding values in the header. */
    private static final int LID_DUMMY_SHORTS = 5;

    public short version;        // dictionary version
    public short block_size;     // (256) bytes per block, fixed
    public short index_block;    // number of index blocks
    public short header_size;    // header size in bytes
    public byte attrlen;         // length of the word attribute
    public byte os;              // OS
    public boolean index_blkbit; // false: 16bit, true: 32bit
    public int extheader;        // extended header size
    public int nindex2;          // number of index elements

    /**
     * Constructor.
     */
    PdicHeader() {
    }

    /**
     * Parse the header from a buffer that has just been filled.
     *
     * <p>The buffer is flipped and switched to little-endian byte order here,
     * so the caller must pass it positioned at the end of the header data.
     *
     * @param header_block buffer holding the raw header bytes
     * @return the major version ({@code version >> 8}) when the attribute
     *         length is 1, otherwise 0
     * @throws RuntimeException if the major version is neither 5 nor 6
     */
    public int load(final ByteBuffer header_block) throws RuntimeException {
        header_block.flip();
        header_block.order(ByteOrder.LITTLE_ENDIAN);

        // Header name and dictionary title are not kept; read them to skip ahead.
        header_block.get(new byte[L_HEADERNAME]);
        header_block.get(new byte[L_DICTITLE]);

        version = header_block.getShort();
        final int major = version & 0xFF00;
        if (major != 0x0500 && major != 0x0600) {
            throw new RuntimeException("Unsupported format");
        }

        header_block.getShort(); // lword
        header_block.getShort(); // ljapa

        block_size = header_block.getShort();
        index_block = header_block.getShort();
        header_size = header_block.getShort();
        header_block.getShort(); // index_size
        header_block.getShort(); // empty_block
        header_block.getShort(); // nindex
        header_block.getShort(); // nblock

        header_block.getInt(); // nword

        header_block.get(); // dicorder
        header_block.get(); // dictype
        attrlen = header_block.get();
        os = header_block.get();

        header_block.getInt(); // ole_number

        // lid_dummy
        for (int i = 0; i < LID_DUMMY_SHORTS; i++) {
            header_block.getShort();
        }

        index_blkbit = header_block.get() != 0;
        header_block.get(); // dummy0
        extheader = header_block.getInt();
        header_block.getInt(); // empty_block2
        nindex2 = header_block.getInt();
        header_block.getInt(); // nblock2

        // Check of the fixed part: the attribute length must be 1.
        return attrlen == 1 ? version >> 8 : 0;
    }
}
Oops, something went wrong.