基本となるDoubleArrayの実装

各種アルゴリズムを試す際のベースとなるような(シンプルな)DoubleArrayが欲しくなったので作成した。

構成など

多分、DoubleArrayとしては一番単純な構成*1。
※ 以下で云う"ノード"は、"ノードのインデックス"の略のような意味合い

静的構築
- 各キーを改行区切り('\n')で保持するソート済みのファイルを入力に取り、DoubleArrayを構築する
BASE配列とCHECK配列から成る
- BASE配列: 遷移情報およびキーのIDを保持する配列
  - BASE[ノード] = 遷移のベースとなるノード
    => 遷移先ノードは、BASE[ノード] + 遷移文字、で求める
  - BASE[ノード]の値がマイナスの場合は、キーの終端を意味し、そのIDが格納されている
    => キーのIDは、BASE[ノード] x -1、で求める
- CHECK配列: 遷移の正当性をチェックするための配列
  - CHECK[ノード] = 遷移元ノード
    => 遷移元ノード = CHECK[BASE[ノード] + 遷移文字]、なら正しい遷移
検索機能
- 完全一致検索
- common-prefix-search

名前は"da"としておく。
名前を付けることにそれほど意味がないような代物だけど、なければないで他との比較の際に不便だったりするので。

現時点での性能

性能というかデータサイズと検索速度を一応測定。
入力キーセットその他は、LOUDS++の七回目と同様。
※ ただし、ベースとなる表には九回目のものを使用している

	データサイズ	検索所要時間(秒)
louds-trie (五回目)	74MB	32.1372s
louds-trie-tail (六回目)	52MB	17.0782s
louds-trie-less-node	52MB	13.4044s
louds-trie-less-node-opt	52MB	11.6995s
doar	94MB	3.19229s
doar2	61MB	4.54477s
tx	67MB	51.556s
darts	416MB	3.35106s
da	391MB	3.12151s

ソースコード

残りはひたすらソースコード。
コマンド用のファイル三つも含めて、全部で400行くらい。
エラーチェック等は不足気味。

※ 以下に出てくるCharStreamクラス、CharStreamVectorクラス、およびそれらのBuilderクラス内での使われ方に関しては、『ソート済みファイルをトライ木に見立ててレベル順(幅優先)探索を行う』も参照。
※ 追記(2010/07/09): 毎回コピー&ペーストするのが面倒になったのでgithubにリポジトリを作成(dabase)。ソース一式はここに配置。

/***
 * ファイル名: char_stream.h
 */
#ifndef CHAR_STREAM_H
#define CHAR_STREAM_H

// 文字列を文字ストリームとして扱うためのクラス
class CharStream {
public:
  CharStream(const char* str) : cur(str) {}
  unsigned char read() { return *cur++; }     
  unsigned char prev() const { return cur[-1]; }
  unsigned char peek() const { return *cur; } 
  const char*   rest() const { return cur; }

private:
  const char* cur;
};

#endif

/***
 * ファイル名: char_stream_vector.h
 */
#ifndef CHAR_STREAM_VECTOR_H
#define CHAR_STREAM_VECTOR_H

#include <vector>
#include <cstdio>
#include <cstring>
#include "char_stream.h"

// 改行('\n')区切りのファイルを、キーリスト(キー文字ストリームリスト)として扱うためのクラス
class CharStreamVector {
public:
  CharStreamVector(const char* filepath) 
    : buf(NULL), valid(false) {
    FILE* f;
    if((f=fopen(filepath,"rb"))==NULL)
      return;

    // ファイルサイズ取得
    fseek(f,0,SEEK_END);
    long file_size = ftell(f);  // XXX: 最大2GB
    fseek(f,0,SEEK_SET);
    
    if(file_size != -1) {
      // ファイルの中身を全部読み込む
      buf = new char[file_size+1];
      fread(buf, sizeof(char), file_size, f);
      buf[file_size]='\0';
      
      if(buf[file_size-1]=='\n')
        buf[file_size-1]='\0';

      init(buf);   // 初期化
      
      valid=true;
    }
    fclose(f);
  }
  ~CharStreamVector() {
    delete [] buf;
  }

  CharStream& operator[] (unsigned index) { return words[index]; }
  unsigned size() const { return words.size(); }
  operator bool() const { return valid; }

private:
  void init(char* cur) {
    words.push_back(CharStream(cur));  
    for(cur=strchr(cur,'\n'); cur; cur=strchr(cur,'\n')) {
      *cur = '\0';                     
      cur++;
      words.push_back(CharStream(cur));
    }
  }

private:
  std::vector<CharStream> words;
  char* buf;
  bool valid;
};

#endif

/***
 * ファイル名: node_allocator.h
 */
#ifndef NODE_ALLOCATOR_H
#define NODE_ALLOCATOR_H

#include <vector>

// DoubleArray構築時に、使用可能なノード(空きノード)を管理するためのクラス
class NodeAllocator {
  typedef unsigned NodeIndex;
  typedef std::vector<unsigned char> UCharList;
  
private:
  // 空きノード用のリンクリストクラス
  struct ForwardLink {
    ForwardLink(NodeIndex i) : idx(i), next(NULL) {}
    
    NodeIndex append(NodeIndex beg_idx, unsigned num) {
      ForwardLink* cur=this;
      while(cur->next) cur=cur->next;
      
      for(NodeIndex i=beg_idx; i < beg_idx+num; i++)
        cur = cur->next = new ForwardLink(i);
      return beg_idx+num-1;
    }
    
    void remove_next() {
      ForwardLink* tmp=next; next=next->next; delete tmp;
    }
    
    void delete_all_tail() {
      for(ForwardLink* tmp=next; next; tmp=next) {
        next=next->next;
        delete tmp;
      }
    }
    
    NodeIndex    idx;   // 空きノード(インデックス)
    ForwardLink* next;  // 次のリンク
  };

public:
  static const unsigned PER_LINK_SIZE=0x10000;
  
public:
  NodeAllocator() : root(0) {
    last_idx = root.append(1,PER_LINK_SIZE);
  }
  ~NodeAllocator() {
    root.delete_all_tail();
  }
  
  int allocate(unsigned char code) {
    static UCharList codes(1);  // XXX: thread unsafe
    codes[0] = code;
    return allocate(codes);
  }
  
  int allocate(const UCharList& codes, ForwardLink* prev=NULL) {
    if(!prev) prev=&root;
    if(!prev->next)
      last_idx = prev->append(last_idx+1, PER_LINK_SIZE);  // 使用可能領域追加

    ForwardLink *cur = prev->next;
    unsigned char min_cd = codes.front();
    
    for(; cur->idx <= min_cd; prev=cur, cur=cur->next);
    for(; cur->next; prev=cur,cur=cur->next) {
      NodeIndex x = cur->idx - min_cd;
      if(can_allocate(cur, codes, x)) {
        alloc(prev,codes,x);
        return x;
      }
    }
    return allocate(codes, cur);  // 領域不足なので、再帰的に繰り返す
  }
  
private:
  bool can_allocate(ForwardLink* cur, const UCharList& codes, NodeIndex x) const {
    cur=cur->next;
    for(unsigned i=1;i < codes.size(); i++) {
      for(; cur && cur->idx < x+codes[i]; cur=cur->next);
      if(!cur || cur->idx > x+codes[i])
        return false;
    }
    return true;
  }
  
  void alloc(ForwardLink* prev, const UCharList& codes, NodeIndex x) {
    prev->remove_next();
    for(unsigned i=1; i < codes.size(); i++) {
      for(; prev->next->idx < x+codes[i]; prev=prev->next);
      prev->remove_next();
    }    
  }
  
private:
  ForwardLink root;
  NodeIndex   last_idx;
};

#endif

/***
 * ファイル名: builder.h
 */
#ifndef BUILDER_H
#define BUILDER_H

#include <cstdio>
#include "node_allocator.h"
#include "char_stream.h"
#include "char_stream_vector.h"

// DoubleArray構築クラス
class Builder {
public:
  Builder(const char* filepath) 
    : csv(filepath), id(0), node_size(csv.size()*15) { // XXX: ノード用の配列のサイズはテキトウに決め打ち (危険!!!)
    base = new int[node_size];
    chck = new int[node_size];
    for(int i=0; i < node_size; i++)
      chck[i] = -1;   // CHECK配列の初期値は-1としておく (0~255以外の値なら何でも良い)
  }

  ~Builder() {
    delete [] base;
    delete [] chck;
  }
  
  Builder& build() {
    build_impl(0, csv.size(), 0);
    return *this;
  }

  void save(const char* filepath) {
    FILE* f = fopen(filepath, "wb");
    
    // 検索時の範囲外アクセスを防ぐために、最後のノードの後ろに、0xFF分のパディングを設けておく
    //   - それ以降の領域は切り詰める
    // XXX: node_size-0xFF から node_size の間には未使用ノードしかないことが前提の処理
    if(node_size > 0xFF)
      while(chck[node_size-0xFF]==-1)
        node_size--;
    
    fwrite(&node_size, sizeof(unsigned), 1, f);
    fwrite(base, sizeof(int), node_size, f);
    fwrite(chck, sizeof(int), node_size, f);
    fclose(f);
  }

private:
  void build_impl(std::size_t beg, std::size_t end, int root_node) {
    if(end-beg == 1) {
      // キー末尾まで分岐がない
      for(; csv[beg].prev() != '\0'; csv[beg].read())
        root_node = set_node(root_node, alloc.allocate(csv[beg].peek()), csv[beg].peek());
      base[root_node] = --id;  // 末端のノードにはIDをセット
      return;
    }

    // 子ノードを集める
    std::vector<unsigned char> children;
    std::vector<std::size_t>   ranges;
    do {
      children.push_back(csv[beg].peek());
      ranges.push_back(beg);
      beg = end_of_same_node(csv, beg, end);
    } while (beg != end);
    ranges.push_back(end);

    // ベースノードを取得
    int base_node = alloc.allocate(children);

    // 子ノードを設定して、各子ノードに対して、再帰的に同様の処理を繰り返す
    for(std::size_t i=0; i < children.size(); i++)
      build_impl(ranges[i], ranges[i+1], set_node(root_node, base_node, children[i]));
  }

  int set_node(int node, int base_node, unsigned char child) {
    int next   = base_node + child;
    base[node] = base_node;
    chck[next] = node;
    return next;
  }

  unsigned end_of_same_node(CharStreamVector& csv, std::size_t beg, std::size_t end) {
    unsigned char ch = csv[beg].read();
    std::size_t cur  = beg+1;
    for(; cur < end && ch == csv[cur].peek(); cur++)
      csv[cur].read();
    return cur;
  }

private:
  CharStreamVector csv;
  int id;
  int node_size;
  int* base;
  int* chck;
  NodeAllocator alloc;
};

#endif

/***
 * ファイル名: double_array.h
 */
#ifndef DOUBLE_ARRAY_H
#define DOUBLE_ARRAY_H

#include <cstdio>
#include "char_stream.h"

// DoubleArray検索用のクラス
class DoubleArray {
public:
  DoubleArray(const char* filepath) {
    FILE* f = fopen(filepath, "rb");
    
    unsigned node_size;
    fread(&node_size, sizeof(unsigned), 1, f);
    base = new int [node_size];
    chck = new int [node_size];
    
    fread(base, sizeof(int), node_size, f);
    fread(chck, sizeof(int), node_size, f);
    fclose(f);
  }
  ~DoubleArray() {
    delete [] base;
    delete [] chck;
  }

  int search(const char* key) const {
    int node_index=0;
    CharStream in(key);
    for(;;) {
      int next_index = base[node_index] + in.read();
      int node       = base[next_index];
      if(chck[next_index] == node_index)
        if(node < 0) return -node;
        else         node_index = next_index;
      else
        return -1;  // キーが存在しない場合は-1を返す
    }
  }

  template<class Callback>
  void each_common_prefix(const char* key, Callback& fn) const {
    int node_index=0;
    CharStream in(key);
    for(unsigned offset=0;; offset++) {
      int terminal_index = base[node_index] + '\0';      // 途中一致があるかどうかを判定
      if(chck[terminal_index] == node_index) {
        fn(key, offset, -base[terminal_index]);          // コールバック呼び出し
        if(in.peek()=='\0')
          break;
      }

      int next_index = base[node_index] + in.read();
      if(chck[next_index] == node_index) {
        // assert(node >= 0);   上で終端チェックは行っているので、nodeがマイナスになることはないはず
        node_index = next_index;
      } else
        break;
    }    
  }

private:
  int *base;
  int *chck;
};

#endif

残りは、上で作成したクラスを利用したコマンド群。

/***
 * ファイル名: mkda.cc
 * コンパイル: g++ -O2 -o mkda mkda.cc
 *
 * DoubleArray構築コマンド
 */
#include <iostream>
#include "builder.h"

int main(int argc, char** argv) {
  if(argc != 3) {
    std::cerr << "Usage: mkda <index> <wordlist>" << std::endl;
    return 1;
  }

  Builder(argv[2]).build().save(argv[1]);
  return 0;
}

/***
 * ファイル名: da.cc
 * コンパイル: g++ -o da da.cc
 *
 * DoubleArray検索コマンド
 *  - 標準入力からキーを受け取り、common-prefix-searchを行った結果を出力する
 */
#include <iostream>
#include <string>
#include "double_array.h"

struct Callback {
  void operator()(const char* key, unsigned offset, int id) const {
    std::cout << " => " << id << "#" << std::string(key,offset) << std::endl;
  }
};

int main(int argc, char** argv) {
  if(argc != 2) {
    std::cerr << "Usage: da <index>" << std::endl;
    return 1;
  }
  
  DoubleArray da(argv[1]);
  std::string word;
  Callback fn;
  
  while(getline(std::cin, word)) 
    da.each_common_prefix(word.c_str(), fn);

  return 0;
}

/***
 * ファイル名: ckda.cc
 * コンパイル: g++ -O2 -o ckda ckda.cc
 *
 * DoubleArray検索/計時コマンド
 *  - 検索対象となるキーセットを含むファイルを引数にとり、検索失敗数および検索に要した時間を出力する
 */
#include <iostream>
#include <sys/time.h>
#include "double_array.h"
#include "char_stream_vector.h"

inline double gettime(){
  timeval tv;
  gettimeofday(&tv,NULL);
  return static_cast<double>(tv.tv_sec)+static_cast<double>(tv.tv_usec)/1000000.0;
}

int main(int argc, char** argv) {
  if(argc != 3) {
    std::cerr << "Usage: ckda <index> <wordlist>" << std::endl;
    return 1;
  }
  
  DoubleArray da(argv[1]);
  CharStreamVector csv(argv[2]);

  double beg_t = gettime();
  unsigned err = 0;
  
  for(unsigned i=0; i < csv.size(); i++)
    if(da.search(csv[i].rest()) == -1)
      err++;

  std::cerr << "Not Found: " << err << "/" << csv.size() << std::endl;
  std::cerr << "Elapsed: " << gettime()-beg_t << " sec" << std::endl << std::endl;

  return 0;
}

*1:使用可能なノードの検索/管理を行うアロケータクラスは若干特殊だけど。
　BASE配列とCHECK配列に4byteずつ割り当てる実装なら、(今回NodeAllocatorが管理しているような)リンクリストは、BASE配列とCHECK配列の未使用領域を流用して実装するのが一般的(?)だと思う。
　その場合の明確な利点は、余分な領域が不要なこと。
　ただし、そうするとコードが煩雑になり、また(DoubleArray自体、もしくはアロケート方法を)他の実装方法に変更したい場合の柔軟性が劣るので、今回はアロケート部分を完全に独立させている。