diff --git a/README.md b/README.md index db799e3..05f14e7 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Very simple dictionary search application supporting; - LingvoDSL dz compression (.dsl.dz) - StarDict (.ifo .dict) - StarDict (.dict.dz) +- PDIC ![Application image](https://raw.githubusercontent.com/eb4j/ebviewer/main/docs/img/screen_image.png) @@ -81,6 +82,8 @@ Copyright (C) 2015-2020 Aaron Madlon-Kay Copyright (C) 2007-2015 Didier Briel +Copyright (C) 2014 wak + Copyright (C) 2008-2010 Alex Buloichik Copyright (C) 2007 Zoltan Bartko diff --git a/build.gradle.kts b/build.gradle.kts index 047f139..4814b38 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -74,6 +74,7 @@ dependencies { implementation("commons-io:commons-io:2.11.0") implementation("org.apache.commons:commons-lang3:3.12.0") implementation("tokyo.northside:url-protocol-handler:0.1.4") + implementation("com.ibm.icu:icu4j-charset:69.1") implementation("io.github.dictzip:dictzip:0.9.5") implementation("com.github.takawitter:trie4j:0.9.8") diff --git a/src/main/java/io/github/eb4j/ebview/dictionary/DictionariesManager.java b/src/main/java/io/github/eb4j/ebview/dictionary/DictionariesManager.java index b736e43..ba0fc19 100644 --- a/src/main/java/io/github/eb4j/ebview/dictionary/DictionariesManager.java +++ b/src/main/java/io/github/eb4j/ebview/dictionary/DictionariesManager.java @@ -29,12 +29,13 @@ public class DictionariesManager { protected final List factories = new ArrayList<>(); protected final List dictionaries = new ArrayList<>(); - private Stemmer stemmer; + private final Stemmer stemmer; public DictionariesManager() { factories.add(new EPWING()); factories.add(new LingvoDSL()); factories.add(new StarDict()); + factories.add(new PDic()); stemmer = new Stemmer(); } @@ -94,7 +95,9 @@ public List findWord(final String word) { if (result.size() == 0) { String[] stemmed = stemmer.doStem(word); if (stemmed.length > 1) { - result = dictionaries.stream().flatMap(dict -> 
doPredictiveLookup(dict, stemmed[0]).stream()).collect(Collectors.toList()); + result = dictionaries.stream() + .flatMap(dict -> doPredictiveLookup(dict, stemmed[0]).stream()) + .collect(Collectors.toList()); } } return result; diff --git a/src/main/java/io/github/eb4j/ebview/dictionary/PDic.java b/src/main/java/io/github/eb4j/ebview/dictionary/PDic.java new file mode 100644 index 0000000..c5423f5 --- /dev/null +++ b/src/main/java/io/github/eb4j/ebview/dictionary/PDic.java @@ -0,0 +1,45 @@ +package io.github.eb4j.ebview.dictionary; + +import io.github.eb4j.ebview.data.IDictionary; +import io.github.eb4j.ebview.dictionary.pdic.PdicDictionary; + +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +/** + * @author Hiroshi Miura + */ +public class PDic implements IDictionaryFactory { + /** + * Determine whether or not the supplied file is supported by this factory. + * This is intended to be a lightweight check, e.g. looking for a file + * extension. + * + * @param file The file to check + * @return Whether or not the file is supported + */ + @Override + public boolean isSupportedFile(final File file) { + return file.getPath().endsWith(".DIC") || file.getPath().endsWith(".dic"); + } + + /** + * Load the given file and return an {@link IDictionary} that wraps it. 
+ * + * @param file The file to load + * @return An IDictionary file that can read articles from the file + */ + @Override + public Set loadDict(final File file) { + Set result = new HashSet<>(); + try { + IDictionary dictionary = new PdicDictionary(file); + result.add(dictionary); + } catch (IOException e) { + e.printStackTrace(); + } + return result; + } +} diff --git a/src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicDictionary.java b/src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicDictionary.java new file mode 100644 index 0000000..b9596d5 --- /dev/null +++ b/src/main/java/io/github/eb4j/ebview/dictionary/pdic/PdicDictionary.java @@ -0,0 +1,115 @@ +package io.github.eb4j.ebview.dictionary.pdic; + +import io.github.eb4j.ebview.data.DictionaryEntry; +import io.github.eb4j.ebview.data.IDictionary; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.ArrayList; +import java.util.List; + +/** + * @author wak (Apache-2.0) + * @author Hiroshi Miura + */ +public class PdicDictionary implements IDictionary { + + static final Logger LOG = LoggerFactory.getLogger(PdicDictionary.class.getName()); + + private final File srcFile; + private final String cachePath; + private PdicInfo dicInfo; + + public PdicDictionary(final File file) throws IOException { + this.srcFile = file; + cachePath = file.getPath() + ".idx"; + final int headerSize = 256; + PdicHeader header; // ヘッダー + + ByteBuffer headerbuff = ByteBuffer.allocate(headerSize); + try (FileInputStream srcStream = new FileInputStream(srcFile); + FileChannel srcChannel = srcStream.getChannel()) { + int len = srcChannel.read(headerbuff); + srcChannel.close(); + if (len == headerSize) { + header = new PdicHeader(); + if (header.load(headerbuff) != 0) { + // Unicode辞書 かつ ver6以上のみ許容 + if ((header.version & 0xFF00) < 0x0600 || header.os != 
0x20) { + LOG.warn("Unsupported dictionary version" + srcFile.getName()); + throw new RuntimeException(); + } + dicInfo = new PdicInfo(srcFile, header.header_size + header.extheader, + header.block_size * header.index_block, header.nindex2, header.index_blkbit, + header.block_size); + if (!dicInfo.readIndexBlock(cachePath)) { + LOG.warn("Failed to load dictionary index of " + srcFile.getName()); + throw new RuntimeException(); + } + dicInfo.setDicName(file.getName()); + } + } + } + } + + @Override + public String getDictionaryName() { + return dicInfo.getDicName(); + } + + /** + * Read article's text. + * + * @param word The word to look up in the dictionary + * @return List of entries. May be empty, but cannot be null. + */ + @Override + public List readArticles(final String word) { + List lists = new ArrayList<>(); + if (dicInfo.searchWord(word.toLowerCase())) { + PdicResult result = dicInfo.getResult(); + for (int i = 0; i < result.getCount(); i++) { + String disp = result.getDisp(i); + if (disp.equals("")) { + disp = result.getIndex(i); + } + StringBuilder sb = new StringBuilder(); + String phone = result.getPhone(i); + if (phone != null) { + sb.append(phone).append(" / "); + } + sb.append(result.getTrans(i)).append("
/**
 * Fixed part of a PDIC dictionary file header.
 *
 * <p>Only the fields needed to locate and read the index are kept; every other
 * header field is read past and discarded by {@link #load(ByteBuffer)}.
 *
 * @author wak (Apache-2.0)
 * @author Hiroshi Miura
 */
@SuppressWarnings({"visibilitymodifier", "membername"})
final class PdicHeader {
    private static final int L_HEADERNAME = 100; // length of the header name string
    private static final int L_DICTITLE = 40; // length of the dictionary title

    private static final int SHORT_BYTES = 2;
    private static final int INT_BYTES = 4;
    private static final int MAJOR_VERSION_MASK = 0xFF00;

    public short version; // dictionary version
    public short block_size; // number of bytes per block
    public short index_block; // number of index blocks
    public short header_size; // number of bytes in the header

    public byte attrlen; // length of the word attribute
    public byte os; // OS flag
    public boolean index_blkbit; // false: 16-bit, true: 32-bit
    public int extheader; // size of the extended header
    public int nindex2; // number of index elements

    /**
     * Constructor.
     */
    PdicHeader() {
    }

    /**
     * Parses the header fields from a buffer that has just been filled.
     *
     * <p>The buffer is flipped and switched to little-endian order before reading,
     * so it must be passed in "write mode" (position at the end of the data).
     *
     * @param header_block buffer holding the header bytes
     * @return the major version (upper byte of the version field) when the
     *         attribute length is 1, otherwise 0
     * @throws RuntimeException if the major version is neither 5 nor 6, or the
     *         buffer is too short
     */
    public int load(final ByteBuffer header_block) throws RuntimeException {
        final ByteBuffer in = header_block;
        in.flip();
        in.order(ByteOrder.LITTLE_ENDIAN);

        skip(in, L_HEADERNAME); // header name
        skip(in, L_DICTITLE); // dictionary title
        version = in.getShort();

        final int major = version & MAJOR_VERSION_MASK;
        if (major != 0x0500 && major != 0x0600) {
            throw new RuntimeException("Unsupported format");
        }

        skip(in, 2 * SHORT_BYTES); // lword, ljapa

        block_size = in.getShort();
        index_block = in.getShort();
        header_size = in.getShort();

        // index_size, empty_block, nindex, nblock (shorts), nword (int), dicorder, dictype (bytes)
        skip(in, 4 * SHORT_BYTES + INT_BYTES + 2);

        attrlen = in.get();
        os = in.get();

        // ole_number (int) followed by the five-short lid_dummy area
        skip(in, INT_BYTES + 5 * SHORT_BYTES);

        index_blkbit = in.get() != 0;
        skip(in, 1); // dummy0
        extheader = in.getInt();
        skip(in, INT_BYTES); // empty_block2
        nindex2 = in.getInt();
        skip(in, INT_BYTES); // nblock2

        // check of the fixed part
        if (attrlen != 1) {
            return 0;
        }
        return version >> 8;
    }

    /**
     * Consumes {@code count} bytes from the buffer and discards them.
     */
    private static void skip(final ByteBuffer buffer, final int count) {
        buffer.get(new byte[count]);
    }

}
    /**
     * Stores the index geometry taken from the dictionary header and opens the
     * dictionary file for random access.
     *
     * <p>If the file cannot be opened, the exception is swallowed and the stream,
     * the block analyzer and the index cache all stay {@code null}.
     *
     * @param file      dictionary file
     * @param start     offset stored as the start of the index area
     * @param size      size stored for the index area
     * @param nindex    number of index entries
     * @param blockbits {@code true} stores a width of 4, {@code false} a width of 2
     * @param blocksize size of one data block
     */
    @SuppressWarnings("avoidinlineconditionals")
    PdicInfo(final File file, final int start, final int size, final int nindex, final boolean blockbits,
             final int blocksize) {
        m_file = file;
        m_start = start;
        m_size = size;
        m_nindex = nindex;
        m_blockbits = (blockbits) ? 4 : 2;
        m_blocksize = blocksize;
        // default limit on the number of results collected per search
        m_searchmax = 10;

        mSearchResult = new PdicResult();
        // both the phonetic field and the main text are decoded as BOCU-1
        mPhoneCharset = CharsetICU.forNameICU("BOCU-1");
        mMainCharset = CharsetICU.forNameICU("BOCU-1");
        try {
            mSrcStream = new RandomAccessFile(m_file, "r");
            mAnalyze = new AnalyzeBlock();
            mPdicInfoCache = new PdicInfoCache(mSrcStream, m_start, m_size);
        } catch (FileNotFoundException ignored) {
            // mSrcStream stays null when the file is missing
        }
    }
+ * + * @return index of block + */ + public int searchIndexBlock(final String word) { + int min = 0; + int max = m_nindex - 1; + + ByteBuffer __word = mEncodeCache.get(word); + if (__word == null) { + __word = encodetoByteBuffer(mMainCharset, word); + mEncodeCache.put(word, __word); + } + int limit = __word.limit(); + byte[] _word = new byte[limit]; + System.arraycopy(__word.array(), 0, _word, 0, limit); + int _wordlen = _word.length; + + int[] indexPtr = mIndexPtr; + int blockbits = m_blockbits; + PdicInfoCache pdicInfoCache = mPdicInfoCache; + + for (int i = 0; i < 32; i++) { + if ((max - min) <= 1) { + return min; + } + final int look = (int) (((long) min + max) / 2); + final int len = indexPtr[look + 1] - indexPtr[look] - blockbits; + final int comp = pdicInfoCache.compare(_word, 0, _wordlen, indexPtr[look], len); + if (comp < 0) { + max = look; + } else if (comp > 0) { + min = look; + } else { + return look; + } + } + return min; + } + + /** + * Read index blocks. + * + * @return true when successfully read block, otherwise false. 
+ */ + public boolean readIndexBlock(final String indexcache) { + if (mSrcStream != null) { + m_bodyptr = m_start + m_size; // 本体位置=( index開始位置+インデックスのサイズ) + if (indexcache != null) { + try (FileInputStream fis = new FileInputStream(indexcache)) { + byte[] buff = new byte[(m_nindex + 1) * 4]; + int readlen = fis.read(buff); + if (readlen == buff.length) { + final int indexlen = m_nindex; + final int[] indexptr = new int[m_nindex + 1]; + mIndexPtr = indexptr; + int ptr = 0; + for (int i = 0; i <= indexlen; i++) { + int b; + int dat; + b = buff[ptr++]; + b &= 0xFF; + dat = b; + b = buff[ptr++]; + b &= 0xFF; + dat |= (b << 8); + b = buff[ptr++]; + b &= 0xFF; + dat |= (b << 16); + b = buff[ptr++]; + b &= 0xFF; + dat |= (b << 24); + indexptr[i] = dat; + } + return true; + } + } catch (IOException ignored) { + } + } + + // インデックスの先頭から見出し語のポインタを拾っていく + final int nindex = m_nindex; + final int[] indexPtr = new int[nindex + 1]; // インデックスポインタの配列確保 + mIndexPtr = indexPtr; + if (mPdicInfoCache.createIndex(m_blockbits, nindex, indexPtr)) { + byte[] buff = new byte[indexPtr.length * 4]; + int p = 0; + for (int c = 0; c <= nindex; c++) { + int data = indexPtr[c]; + buff[p++] = (byte) (data & 0xFF); + data >>= 8; + buff[p++] = (byte) (data & 0xFF); + data >>= 8; + buff[p++] = (byte) (data & 0xFF); + data >>= 8; + buff[p++] = (byte) (data & 0xFF); + } + if (indexcache != null) { + try (FileOutputStream fos = new FileOutputStream(indexcache)) { + fos.write(buff, 0, buff.length); + } catch (IOException ignored) { + } + } + return true; + } + } + mIndexPtr = null; + return false; + } + + /** + * num個目の見出し語の実体が入っているブロック番号を返す. + */ + public int getBlockNo(final int num) { + int blkptr = mIndexPtr[num] - m_blockbits; + mLastIndex = num; + if (m_blockbits == 4) { + return mPdicInfoCache.getInt(blkptr); + } else { + return mPdicInfoCache.getShort(blkptr); + } + } + + /** + * 次の0までの長さを返す. + * + * @param array target byte array + * @param pos start position + * @return length of index. 
+ */ + static int getLengthToNextZero(final byte[] array, final int pos) { + return ArrayUtils.indexOf(array, (byte) 0, pos) - pos; + // int len = 0; + // while (array[pos + len] != 0) + // len++; + // return len; + } + + boolean isMatch() { + return m_match; + } + + public String getFilename() { + return m_file.getName(); + } + + public int getSearchMax() { + return m_searchmax; + } + + public void setSearchMax(final int m) { + m_searchmax = m; + } + + public void setDicName(final String b) { + m_dicname = b; + } + + public String getDicName() { + return m_dicname; + } + + // 単語を検索する + public boolean searchWord(final String _word) { + // 検索結果クリア + int cnt = 0; + mSearchResult.clear(); + + int ret = searchIndexBlock(_word); + + boolean match = false; + + boolean searchret = false; + while (true) { + // 最終ブロックは超えない + if (ret < m_nindex) { + // 該当ブロック読み出し + int block = getBlockNo(ret++); + byte[] pblk = readBlockData(block); + if (pblk != null) { + mAnalyze.setBuffer(pblk); + mAnalyze.setSearch(_word); + searchret = mAnalyze.searchWord(); + // 未発見でEOBの時のみもう一回、回る + if (!searchret && mAnalyze.mEob) { + continue; + } + } + } + // 基本一回で抜ける + break; + } + if (searchret) { + // 前方一致するものだけ結果に入れる + do { + PdicElement res = mAnalyze.getRecord(); + if (res == null) { + break; + } + // 完全一致するかチェック + if (res.mIndex.compareTo(_word) == 0) { + match = true; + } + mSearchResult.add(res); + + cnt++; + // 取得最大件数超えたら打ち切り + } while (cnt < m_searchmax && hasMoreResult(true)); + } + return match; + } + + // 前方一致する単語の有無を返す + boolean searchPrefix(final String _word) { + int ret = searchIndexBlock(_word); + + for (int blk = 0; blk < 2; blk++) { + // 最終ブロックは超えない + if (ret + blk >= m_nindex) { + break; + } + int block = getBlockNo(ret + blk); + + // 該当ブロック読み出し + byte[] pblk = readBlockData(block); + + if (pblk != null) { + mAnalyze.setBuffer(pblk); + mAnalyze.setSearch(_word); + + if (mAnalyze.searchWord()) { + return true; + } + } + } + return false; + } + + PdicResult getResult() { + return 
mSearchResult; + } + + public PdicResult getMoreResult() { + mSearchResult.clear(); + if (mAnalyze != null) { + int cnt = 0; + // 前方一致するものだけ結果に入れる + while (cnt < m_searchmax && hasMoreResult(true)) { + PdicElement res = mAnalyze.getRecord(); + if (res == null) { + break; + } + mSearchResult.add(res); + cnt++; + } + } + return mSearchResult; + } + + public boolean hasMoreResult(final boolean incrementptr) { + boolean result = mAnalyze.hasMoreResult(incrementptr); + if (!result) { + if (mAnalyze.isEob()) { // EOBなら次のブロック読み出し + int nextindex = mLastIndex + 1; + // 最終ブロックは超えない + if (nextindex < m_nindex) { + int block = getBlockNo(nextindex); + + // 該当ブロック読み出し + byte[] pblk = readBlockData(block); + + if (pblk != null) { + mAnalyze.setBuffer(pblk); + result = mAnalyze.hasMoreResult(incrementptr); + } + } + } + } + return result; + } + + /** + * データブロックを読み込み. + * + * @param blkno + * @return 読み込まれたデータブロック + */ + byte[] readBlockData(final int blkno) { + byte[] buff = new byte[0x200]; + byte[] pbuf = buff; + try { + mSrcStream.seek(m_bodyptr + (long) blkno * m_blocksize); + + // 1ブロック分読込(1セクタ分先読み) + if (mSrcStream.read(pbuf, 0, 0x200) < 0) { + return null; + } + + // 長さ取得 + int len = ((int) (pbuf[0])) & 0xFF; + len |= (((int) (pbuf[1])) & 0xFF) << 8; + + // ブロック長判定 + if ((len & 0x8000) != 0) { // 32bit + len &= 0x7FFF; + } + if (len > 0) { + // ブロック不足分読込 + if (len * m_blocksize > 0x200) { + pbuf = new byte[m_blocksize * len]; + System.arraycopy(buff, 0, pbuf, 0, 0x200); + if (mSrcStream.read(pbuf, 0x200, len * m_blocksize - 0x200) < 0) { + return null; + } + } + } else { + pbuf = null; + } + return pbuf; + } catch (IOException ignored) { + } + return null; + } + + final class AnalyzeBlock { + private byte[] mBuff; + private boolean mLongfield; + private byte[] mWord; + private int mFoundPtr = -1; + private int mNextPtr = -1; + private final byte[] mCompbuff = new byte[1024]; + private int mCompLen = 0; + private boolean mEob = false; + + public AnalyzeBlock() { + } + + 
public void setBuffer(final byte[] buff) { + mBuff = buff; + mLongfield = ((buff[1] & 0x80) != 0); + ByteBuffer mBB = ByteBuffer.wrap(buff); + mBB.order(ByteOrder.LITTLE_ENDIAN); + mNextPtr = 2; + mEob = false; + mCompLen = 0; + } + + public void setSearch(final String word) { + ByteBuffer __word = encodetoByteBuffer(mMainCharset, word); + mEncodeCache.put(word, __word); + mWord = new byte[__word.limit()]; + System.arraycopy(__word.array(), 0, mWord, 0, __word.limit()); + } + + public boolean isEob() { + return mEob; + } + + /** + * ブロックデータの中から指定語を探す. + */ + public boolean searchWord() { + final byte[] _word = mWord; + final byte[] buff = mBuff; + final boolean longfield = mLongfield; + final byte[] compbuff = mCompbuff; + final int wordlen = _word.length; + + mFoundPtr = -1; + + // 訳語データ読込 + int ptr = mNextPtr; + mNextPtr = -1; + while (true) { + int flen = 0; + int retptr = ptr; + int b; + + b = buff[ptr++]; + flen |= (b & 0xFF); + + b = buff[ptr++]; + b <<= 8; + flen |= (b & 0xFF00); + + if (longfield) { + b = buff[ptr++]; + b <<= 16; + flen |= (b & 0xFF0000); + + b = buff[ptr++]; + b <<= 24; + flen |= (b & 0x7F000000); + } + if (flen == 0) { + mEob = true; + break; + } + int qtr = ptr; + ptr += flen + 1; + ptr++; + + + // 圧縮長 + int complen = (int) buff[qtr++]; + complen &= 0xFF; + + // 見出し語属性 skip + qtr++; + + // 見出し語圧縮位置保存 + while ((compbuff[complen++] = buff[qtr++]) != 0) ; + + // 見出し語の方が短ければ不一致 + if (complen < wordlen) { + continue; + } + + + // 前方一致で比較 + boolean equal = true; + for (int i = 0; i < wordlen; i++) { + + if (compbuff[i] != _word[i]) { + equal = false; + int cc = compbuff[i]; + cc &= 0xFF; + int cw = _word[i]; + cw &= 0xFF; + // 超えてたら打ち切る + if (cc > cw) { + return false; + } + break; + } + } + if (equal) { + mFoundPtr = retptr; + mNextPtr = ptr; + mCompLen = complen - 1; + return true; + } + } + return false; + } + + /** + * 最後の検索結果の単語を返す. 
        /**
         * Returns the entry found by the last successful search step.
         *
         * <p>The headword is decoded from {@code mCompbuff}; the translation and the
         * optional sample and pronunciation fields are decoded from the block buffer
         * starting at {@code mFoundPtr}. The attribute byte is read into a local
         * only; this method does not store it in the returned element.
         *
         * @return search result, or {@code null} when there is no current match
         */
        PdicElement getRecord() {
            if (mFoundPtr == -1) {
                return null;
            }
            final PdicElement res = new PdicElement();

            res.mIndex = decodetoCharBuffer(mMainCharset, mCompbuff, 0, mCompLen).toString();
            // Version 6 support: the headword is stored as
            // <search index> TAB <display string>, so split it.
            // Without a TAB the display string is kept as an empty string.

            final String indexstr = res.mIndex;
            final int tab = indexstr.indexOf('\t');
            if (tab == -1) {
                res.mDisp = "";
            } else {
                res.mIndex = indexstr.substring(0, tab);
                res.mDisp = indexstr.substring(tab + 1);
            }

            final byte[] buff = mBuff;
            final boolean longfield = mLongfield;
            byte attr = 0;

            // read the translation data
            int ptr = mFoundPtr;

            // skip the field-length prefix (4 bytes in long-field blocks, otherwise 2)
            if (longfield) {
                ptr += 4;
            } else {
                ptr += 2;
            }
            int qtr = ptr;

            // compression length byte: not needed here, just skipped
            qtr++;

            // headword attribute
            attr = buff[qtr++];

            while (buff[qtr++] != 0) {
                // skip the headword up to and including its terminating zero
            }

            // translation
            if ((attr & 0x10) != 0) { // extended attributes are present
                int trnslen = getLengthToNextZero(buff, qtr);
                res.mTrans = decodetoCharBuffer(mMainCharset, buff, qtr, trnslen).toString().replace("\r", "");
                qtr += trnslen; // advance to the next NUL

                // read the extended attributes; a byte with bit 0x80 set ends the list
                byte eatr;
                while (((eatr = buff[qtr++]) & 0x80) == 0) {
                    if ((eatr & (0x10 | 0x40)) == 0) { // neither binary nor compressed
                        if ((eatr & 0x0F) == 0x01) { // sample sentence
                            int len = getLengthToNextZero(buff, qtr);
                            res.mSample = decodetoCharBuffer(mMainCharset, buff, qtr, len).toString().replace("\r", "");
                            qtr += len; // advance to the next NUL
                        } else if ((eatr & 0x0F) == 0x02) { // pronunciation
                            int len = getLengthToNextZero(buff, qtr);
                            res.mPhone = decodetoCharBuffer(mPhoneCharset, buff, qtr, len).toString();
                            qtr += len; // advance to the next NUL
                        }
                    } else {
                        // stop as soon as a binary or compressed attribute shows up
                        break;
                    }
                }
            } else {
                // everything up to the next entry is the translation
                res.mTrans = decodetoCharBuffer(mMainCharset, buff, qtr, mNextPtr - qtr).toString().replace("\r", "");
            }
            return res;
        }
/**
 * Read access to the index area of a PDIC dictionary file.
 *
 * <p>An index area smaller than 512 KiB is loaded once and kept in memory as a
 * single segment ("fixed" mode). A larger index is read in 1 KiB segments that
 * are cached through weak references, so they may be dropped and re-read.
 *
 * @author wak (Apache-2.0)
 * @author Hiroshi Miura
 */
class PdicInfoCache {
    /** Index areas smaller than this are held completely in memory. */
    private static final int FIXED_BUFFER_LIMIT = 1024 * 512;
    /** Segment size used when the index is not held completely in memory. */
    private static final int SEGMENT_SIZE = 1024;
    private static final int BYTE_MASK = 0xFF;
    private static final int TAB = 0x09;

    private final boolean mFix;
    private final int mBlockSize;
    private final RandomAccessFile mFile;
    private final int mStart;
    private final int mSize;
    private final WeakHashMap<Integer, WeakReference<byte[]>> mMap = new WeakHashMap<>();
    private byte[] mFixedBuffer;

    /**
     * @param file  dictionary file opened for reading
     * @param start file offset of the index area
     * @param size  size of the index area in bytes
     */
    PdicInfoCache(final RandomAccessFile file, final int start, final int size) {
        mFile = file;
        mStart = start;
        mSize = size;
        mFix = mSize < FIXED_BUFFER_LIMIT;
        if (mFix) {
            mBlockSize = mSize;
        } else {
            mBlockSize = SEGMENT_SIZE;
        }
    }

    /**
     * Returns the bytes of one index segment.
     *
     * <p>In fixed mode the whole index is segment 0 and is read from the file
     * only once; any other segment number yields {@code null}.
     *
     * @param segment segment number, counted in units of the segment size
     * @return the segment data, or {@code null} when it cannot be read
     */
    byte[] getSegment(final int segment) {
        if (mFix) {
            if (segment != 0) {
                return null;
            }
            return fixedBuffer();
        }

        byte[] data = null;
        final WeakReference<byte[]> ref = mMap.get(segment);
        if (ref != null) {
            data = ref.get();
        }
        if (data == null) {
            final byte[] fresh = new byte[mBlockSize];
            final int filled;
            try {
                filled = fill(mStart + (long) segment * mBlockSize, fresh);
            } catch (IOException e) {
                return null;
            }
            // accept a full segment, or the partial last segment of the index
            if (filled <= 0 || (filled != mBlockSize && filled != mSize % mBlockSize)) {
                return null;
            }
            mMap.put(segment, new WeakReference<>(fresh));
            data = fresh;
        }
        return data;
    }

    /**
     * Loads the whole index on first use and returns the same array afterwards.
     * A failed or incomplete read is not cached.
     */
    private byte[] fixedBuffer() {
        if (mFixedBuffer == null) {
            final byte[] fresh = new byte[mSize];
            try {
                if (fill(mStart, fresh) != mSize) {
                    return null;
                }
            } catch (IOException e) {
                return null;
            }
            mFixedBuffer = fresh;
        }
        return mFixedBuffer;
    }

    /**
     * Reads from {@code position} until the array is full or the file ends;
     * a single {@code read} call may deliver fewer bytes than requested.
     *
     * @return number of bytes stored in {@code target}
     */
    private int fill(final long position, final byte[] target) throws IOException {
        mFile.seek(position);
        int filled = 0;
        while (filled < target.length) {
            final int count = mFile.read(target, filled, target.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
        return filled;
    }

    /**
     * Reads a little-endian 16-bit value from the index area.
     *
     * @param ptr offset inside the index area
     * @return the value, or 0 when the first segment cannot be read
     */
    public int getShort(final int ptr) {
        return readLittleEndian(ptr, 2, BYTE_MASK);
    }

    /**
     * Reads a little-endian 32-bit value from the index area; the top bit of
     * the most significant byte is masked off.
     *
     * @param ptr offset inside the index area
     * @return the value, or 0 when the first segment cannot be read
     */
    public int getInt(final int ptr) {
        return readLittleEndian(ptr, 4, 0x7F);
    }

    /**
     * Assembles {@code count} bytes starting at {@code ptr} into a little-endian
     * value, moving on to the next segment when a boundary is crossed.
     *
     * @param lastByteMask mask applied to the most significant byte
     */
    private int readLittleEndian(final int ptr, final int count, final int lastByteMask) {
        int segment = ptr / mBlockSize;
        int address = ptr % mBlockSize;
        byte[] data = getSegment(segment++);
        if (data == null) {
            return 0;
        }
        int value = 0;
        for (int n = 0; n < count; n++) {
            if (address >= mBlockSize) {
                address %= mBlockSize;
                data = getSegment(segment++);
                if (data == null) {
                    throw new IllegalStateException("Index segment " + (segment - 1) + " is not available");
                }
            }
            int b = data[address++] & BYTE_MASK;
            if (n == count - 1) {
                b &= lastByteMask;
            }
            value |= b << (8 * n);
        }
        return value;
    }

    /**
     * Compares two byte ranges as sequences of unsigned bytes.
     *
     * <p>When {@code aa} is exhausted first, the ranges count as equal if the
     * next byte of {@code ab} is a TAB (a TAB in the compared entry is treated
     * like a terminator); otherwise {@code aa} sorts first.
     *
     * @return negative, zero or positive as {@code aa} sorts before, equal to or after {@code ab}
     */
    private static int compareArrayAsUnsigned(final byte[] aa, final int pa, final int la,
                                              final byte[] ab, final int pb, final int lb) {
        int i = 0;
        while (i < la) {
            if (i >= lb) {
                return 1;
            }
            final int ua = aa[pa + i] & BYTE_MASK;
            final int ub = ab[pb + i] & BYTE_MASK;
            if (ua != ub) {
                return ua - ub;
            }
            i++;
        }
        if (lb > i) {
            if (ab[pb + i] == TAB) {
                return 0;
            }
            return -1;
        }
        return 0;
    }

    /**
     * Compares a search key with an entry stored in the index area.
     *
     * @param aa  array holding the key
     * @param pa  offset of the key inside {@code aa}
     * @param la  length of the key
     * @param ptr offset of the entry inside the index area
     * @param len length of the entry
     * @return negative, zero or positive as the key sorts before, equal to or
     *         after the entry; -1 when the index data cannot be read and 1 when
     *         {@code len} is negative
     */
    public int compare(final byte[] aa, final int pa, final int la, final int ptr, final int len) {
        final int segment = ptr / mBlockSize;
        final int address = ptr % mBlockSize;
        final byte[] first = getSegment(segment);

        if (first == null) {
            return -1;
        }
        if (len < 0) {
            return 1;
        }

        if (address + len < mBlockSize) {
            return compareArrayAsUnsigned(aa, pa, la, first, address, len);
        }

        // the entry reaches the end of this segment: compare the part inside it first
        final int headLen = mBlockSize - address;
        final int head = compareArrayAsUnsigned(aa, pa, Math.min(la, headLen), first, address, headLen);
        if (head != 0) {
            return head;
        }
        if (la < headLen) {
            // NOTE(review): a key that ends before the segment boundary is reported as
            // "before" here even when the entry continues with a TAB; kept as is.
            return -1;
        }

        final int restKey = la - headLen;
        final int restEntry = len - headLen;
        final byte[] second = getSegment(segment + 1);
        if (second == null && (restKey > 0 || restEntry > 0)) {
            return -1;
        }
        return compareArrayAsUnsigned(aa, pa + headLen, restKey, second, 0, restEntry);
    }

    /**
     * Create index of words.
     *
     * <p>Scans the index area, where every entry is a block number of
     * {@code blockbits} bytes followed by a zero-terminated headword, and stores
     * for each entry the offset of its headword. One extra terminator value is
     * stored after the last entry. Scanning is limited to the index area and to
     * the real length of each segment.
     *
     * @param blockbits width of a block number in bytes
     * @param nindex    number of entries to collect
     * @param indexPtr  receives {@code nindex + 1} offsets
     * @return true when success, otherwise false (a segment could not be read or
     *         the index area ended before {@code nindex} entries were found).
     */
    public boolean createIndex(final int blockbits, final int nindex, final int[] indexPtr) {
        int entry = 0;
        int scanned = 0; // bytes of the index area consumed so far
        int skip = 0; // remaining block-number bytes to step over
        boolean atEntryStart = true; // the index area begins with an entry

        for (int segment = 0; entry < nindex; segment++) {
            if (scanned >= mSize) {
                return false;
            }
            final byte[] data = getSegment(segment);
            if (data == null) {
                return false;
            }
            final int valid = (int) Math.min(data.length, (long) mSize - scanned);
            int i = 0;
            for (; i < valid && entry < nindex; i++) {
                if (skip > 0) {
                    skip--;
                } else if (atEntryStart) {
                    // the headword follows the block number
                    indexPtr[entry++] = scanned + i + blockbits;
                    skip = blockbits - 1;
                    atEntryStart = false;
                } else if (data[i] == 0) {
                    atEntryStart = true;
                }
            }
            scanned += i;
        }
        indexPtr[entry] = scanned + blockbits; // terminator
        return true;
    }

}
+ } + + public String getIndex(final int idx) { + return get(idx).mIndex; + } + + public String getDisp(final int idx) { + return get(idx).mDisp; + } + + public byte getAttr(final int idx) { + return get(idx).mAttr; + } + + public String getTrans(final int idx) { + return get(idx).mTrans; + } + + public String getPhone(final int idx) { + return get(idx).mPhone; + } + + public String getSample(final int idx) { + return get(idx).mSample; + } + +}