Igo-0.4.3の辞書引きにDAWGを試す
Igo-0.4.3: 若干のパフォーマンス向上 - sileの日記の続き。
Gomoku(0.0.4)では辞書引き部分*1にDAWGを使っているので、それもIgoに取り込んで処理速度の変化を図ってみた。
比較
諸々の条件は前回と同様。
今回は新たにIgoのDAWG版が加わる。
総処理時間(1) | 起動+行読み込み+辞書ロード 時間(2) | 解析時間(1 - 2) | |
MeCab(0.98) | 15.381s | 0.101s | 15.280s |
Gomoku(0.0.4) | 16.458s | 0.565s | 15.893s |
Igo(0.4.2) | 20.351s | 0.638s | 19.713s |
Igo(0.4.3) | 18.792s | 0.638s | 18.154s |
Igo-DAWG(0.4.3) | 17.168s | 0.638s | 16.529s |
ただこの変更はバイナリ辞書のフォーマットが変わってしまうので、少なくとも当面は正式なリリースとして公開することはしない。
DAWG版作成方法
手順のメモ。
基本的には、従来使っていた辞書引きクラスをdic-androidにあるDAWGのそれに置き換えるだけ。
# 1: DAWG版へのパッチ適用 $ tar zxvf igo-0.4.3-src.tar.gz $ patch -p0 -E < igo-0.4.3-dawg.patch # igo-0.4.3-dawg.patch に関しては末尾を参照 # 2: ソースビルド $ cd igo-0.4.3-src $ ant # 3: 辞書構築 # - 単語名のリストが標準出力に出力されるので、それを保存しておく $ java -cp igo-0.4.3.jar net.reduls.igo.bin.BuildDic バイナリ辞書 テキスト辞書 > word.list # 4: 辞書引き用のインデックスを作成する # 4-1: dic-androidのソースを取得 $ wget https://download.github.com/sile-dic-android-v0.0.1-0-ga45cd32.tar.gz $ tar zxvf sile-dic-android-v0.0.1-0-ga45cd32.tar.gz # 4-2: インデックス構築 $ cd sile-dic-android-a45cd32/dicbuilder/ $ sbcl # sbcl起動 ※common-lisp処理系 > (require :asdf) > (asdf:load-system :dic) ;; 単語リスト読み込み > (defvar *words* (with-open-file (in "/path/to/word.list") (loop FOR line = (read-line in nil nil) WHILE line COLLECT line))) ;; DAWGインデックス作成: Igoバイナリ辞書ディレクトリに追加 > (dic.trie.double-array:build *words* "バイナリ辞書/") > (quit) # 終了
パッチ
ファイル名はigo-0.4.3-dawg.patch:
diff -rcN igo-0.4.3-src-orig/src/net/reduls/igo/dictionary/WordDic.java igo-0.4.3-src/src/net/reduls/igo/dictionary/WordDic.java *** igo-0.4.3-src-orig/src/net/reduls/igo/dictionary/WordDic.java 2011-06-17 17:24:58.000000000 -0700 --- igo-0.4.3-src/src/net/reduls/igo/dictionary/WordDic.java 2011-06-18 12:15:57.167778148 -0700 *************** *** 3,12 **** import java.io.IOException; import java.util.List; import net.reduls.igo.trie.Searcher; import net.reduls.igo.util.FileMappedInputStream; public final class WordDic { ! private final Searcher trie; private final String data; private final int[] indices; --- 3,13 ---- import java.io.IOException; import java.util.List; import net.reduls.igo.trie.Searcher; + import net.reduls.igo.trie.DawgSearcher; import net.reduls.igo.util.FileMappedInputStream; public final class WordDic { ! private final DawgSearcher trie; private final String data; private final int[] indices; *************** *** 16,22 **** public final int[] dataOffsets; // dataOffsets[単語ID] = 単語の素性データの開始位置 public WordDic(String dataDir) throws IOException { ! trie = new Searcher(dataDir+"/word2id"); data = FileMappedInputStream.getString(dataDir+"/word.dat"); indices = FileMappedInputStream.getIntArray(dataDir+"/word.ary.idx"); --- 17,23 ---- public final int[] dataOffsets; // dataOffsets[単語ID] = 単語の素性データの開始位置 public WordDic(String dataDir) throws IOException { ! trie = new DawgSearcher(dataDir); data = FileMappedInputStream.getString(dataDir+"/word.dat"); indices = FileMappedInputStream.getIntArray(dataDir+"/word.ary.idx"); diff -rcN igo-0.4.3-src-orig/src/net/reduls/igo/dictionary/build/WordDic.java igo-0.4.3-src/src/net/reduls/igo/dictionary/build/WordDic.java *** igo-0.4.3-src-orig/src/net/reduls/igo/dictionary/build/WordDic.java 2010-03-29 02:35:02.000000000 -0700 --- igo-0.4.3-src/src/net/reduls/igo/dictionary/build/WordDic.java 2011-06-18 12:15:57.167778148 -0700 *************** *** 50,56 **** // 単語辞書からキーを集める for(File csvFile : new File(inputDir).listFiles(new onlyCsv())) collectKey(csvFile.getPath(), keyList, ""); ! final Builder bld = Builder.build(keyList); bld.save(outputDir+"/word2id"); } --- 50,56 ---- // 単語辞書からキーを集める for(File csvFile : new File(inputDir).listFiles(new onlyCsv())) collectKey(csvFile.getPath(), keyList, ""); ! final Builder bld = Builder.build(keyList); bld.save(outputDir+"/word2id"); } diff -rcN igo-0.4.3-src-orig/src/net/reduls/igo/trie/Builder.java igo-0.4.3-src/src/net/reduls/igo/trie/Builder.java *** igo-0.4.3-src-orig/src/net/reduls/igo/trie/Builder.java 2010-03-29 02:35:02.000000000 -0700 --- igo-0.4.3-src/src/net/reduls/igo/trie/Builder.java 2011-06-18 12:15:57.167778148 -0700 *************** *** 23,30 **** java.util.Collections.sort(keyList); String prev=null; for(String k : keyList) ! if(k.equals(prev)==false) ! ksList.add(new KeyStream(prev=k)); } /** --- 23,33 ---- java.util.Collections.sort(keyList); String prev=null; for(String k : keyList) ! if(k.equals(prev)==false) { ! System.out.println(k); ! ksList.add(new KeyStream(prev=k)); ! } ! } /** diff -rcN igo-0.4.3-src-orig/src/net/reduls/igo/trie/Char.java igo-0.4.3-src/src/net/reduls/igo/trie/Char.java *** igo-0.4.3-src-orig/src/net/reduls/igo/trie/Char.java 1969-12-31 16:00:00.000000000 -0800 --- igo-0.4.3-src/src/net/reduls/igo/trie/Char.java 2011-06-18 12:15:57.167778148 -0700 *************** *** 0 **** --- 1,39 ---- + package net.reduls.igo.trie; + + import java.io.DataInputStream; + import java.io.FileInputStream; + import java.io.IOException; + import java.util.List; + import java.util.ArrayList; + + public final class Char { + private final char[] charCode; + private final List<Character> arcs; + + public Char(String dictionaryDirectory) throws IOException { + final DataInputStream in = + new DataInputStream(new FileInputStream(dictionaryDirectory+"/code-map.bin")); + try { + final int codeLimit = in.readInt(); + charCode = new char[codeLimit]; + + arcs = new ArrayList<Character>(); + + for(int i=0; i < codeLimit; i++) { + charCode[i] = in.readChar(); + if(charCode[i] != 0) + arcs.add(charCode[i]); + } + } finally { + in.close(); + } + } + + public char code(char ch) { + return charCode[ch]; + } + + public List<Character> arcs() { + return arcs; + } + } diff -rcN igo-0.4.3-src-orig/src/net/reduls/igo/trie/DawgSearcher.java igo-0.4.3-src/src/net/reduls/igo/trie/DawgSearcher.java *** igo-0.4.3-src-orig/src/net/reduls/igo/trie/DawgSearcher.java 1969-12-31 16:00:00.000000000 -0800 --- igo-0.4.3-src/src/net/reduls/igo/trie/DawgSearcher.java 2011-06-18 12:15:57.167778148 -0700 *************** *** 0 **** --- 1,70 ---- + package net.reduls.igo.trie; + + import java.io.FileInputStream; + import java.io.IOException; + import java.nio.ByteBuffer; + import java.nio.channels.FileChannel; + + + /** + * DoubleArray検索用のクラス + */ + public final class DawgSearcher { + private final long[] nodes; + private final Char charcode; + + public DawgSearcher(String dictionaryDirectory) throws IOException { + final FileChannel cnl = + new FileInputStream(dictionaryDirectory+"/surface-id.bin").getChannel(); + try { + final ByteBuffer buf = cnl.map(FileChannel.MapMode.READ_ONLY, 0, cnl.size()); + final int nodeCount = buf.getInt(); + nodes = new long[nodeCount]; + buf.asLongBuffer().get(nodes); + } finally { + cnl.close(); + } + + charcode = new Char(dictionaryDirectory); + } + + public void eachCommonPrefix(CharSequence text, int start, Searcher.Callback fn) { + int node = 0; + int id = -1; + + for(int i=start;; i++) { + if(isTerminal(node)) + fn.call(start, i-start, id); + + if(i==text.length()) + return; + + final char arc = charcode.code(text.charAt(i)); + final int next = base(node)+arc; + if(chck(next) != arc) + return; + node = next; + id = nextId(id,node); + } + } + + private char chck(int node) { + return (char)((nodes[node]>>24) & 0xFFFF); + } + + private int base(int node) { + return (int)(nodes[node] & 0xFFFFFF); + } + + private boolean isTerminal(int node) { + return ((nodes[node]>>40) & 0x1) == 0x1; + } + + private int siblingTotal(int node) { + return (int)(nodes[node]>>41); + } + + private int nextId(int id, int node) { + return id + siblingTotal(node) + (isTerminal(node) ? 1 : 0); + } + }