ソート済みファイルをトライ木に見立ててレベル順(幅優先)探索を行う

以下のような内容のファイルがあるとする。

# ファイル名: sample.txt  ※ この部分はファイルに含まれない
ab
abc
abef
1
1248
126
13579

これは次のような構造を持つトライ木として考えることができる。※'_ROOT_'は仮想的なスーパールート

今回は、上のようなソート済みのファイルを入力とし、それに対応する木(トライ木)をレベル順(幅優先)に探索する、といったことを行う。

補助クラス

まずは、そのために必要な補助的なクラスを定義。

/***
 * ファイル名: char_stream.h
 * 概要:
 *  - 文字ストリーム
 *  - char* の軽いラッパークラス
 */
#ifndef CHAR_STREAM_H
#define CHAR_STREAM_H

class CharStream {
public:
  CharStream(const char* str) : cur(str) {}
  
  char read()   { return *cur++; }         // 一文字読み込み
  // void unread() { cur--; }                 // 一文字戻る
  char peek() const { return *cur; }       // 現在の文字取得
  // const char* rest() const { return cur; } // 文字ストリームの残りの部分を取得する
  bool eos()  const { return *cur=='\0'; } // 文字ストリーム終端判定

private:
  const char* cur;
};

#endif

/***
 * ファイル名: char_stream_vector.h
 * 概要:
 *  - CharStreamの配列
 *  - 改行区切りのファイルを読み込み、メモリ上にCharStreamの配列を作成する
 */
#ifndef CHAR_STREAM_VECTOR_H
#define CHAR_STREAM_VECTOR_H

#include <vector>
#include <cstdio>
#include <cstring>
#include "char_stream.h"

class CharStreamVector {
public:
  // ファイルのパスを受け取り、CharStreamの配列を作成する
  // '\n'で区切られた各行を一つの文字列(文字ストリーム)として解釈する
  CharStreamVector(const char* filepath) 
    : buf(NULL), valid(false) {
    FILE* f;
    if((f=fopen(filepath,"rb"))==NULL)
      return;

    // ファイルサイズ取得
    fseek(f,0,SEEK_END);
    long file_size = ftell(f); // XXX: long型の最大値よりも大きいファイルは扱えない
    fseek(f,0,SEEK_SET);
    
    if(file_size != -1) {
      // ファイルの内容を全部読み込む
      buf = new char[file_size+1];
      fread(buf, sizeof(char), file_size, f);
      buf[file_size]='\0';
      
      if(buf[file_size-1]=='\n')
        buf[file_size-1]='\0';    // 末尾が改行文字の場合はNULL文字に置換      

      // 初期化 = CharStreamの配列作成
      init(buf);

      // 初期化完了フラグをtrueに
      valid=true;
    }
    fclose(f);
  }
  
  CharStream& operator[] (unsigned index) { return words[index]; }
  unsigned size() const { return words.size(); }
  operator bool() const { return valid; }

private:
  // 初期化処理
  //  - 改行文字=='\n'が前提
  void init(char* cur) {
    words.push_back(CharStream(cur));    // 先頭を配列に追加
    for(cur=strchr(cur,'\n'); cur; cur=strchr(cur,'\n')) {
      *cur = '\0';                       // 改行文字をヌル文字に置換
      cur++;
      words.push_back(CharStream(cur));  // 次の行を配列に追加
    }
  }

private:
  std::vector<CharStream> words;
  char* buf;
  bool valid;
};

#endif

これで、ソート済みファイルの各行を文字ストリームとして扱えるようになった。

レベル順探索

基本的には普通のレベル順探索とほとんど変わらない。
※ CharStream.readで文字を読み進めることが、自動的にレベルを一つ深くすることに対応しているのが若干分かりにくいかも

/***
 * ファイル名: level_order.cc
 * 概要:
 *  - 引数に受け取ったソート済みファイルをトライ木に見立ててレベル順に探索する
 */
#include "char_stream_vector.h"
#include <iostream>
#include <deque>

// 範囲
// 同一ノードの始点と終点を表すために用いる
struct Range {
  Range(unsigned begin, unsigned end) : beg(begin), end(end) {}
  unsigned beg;
  unsigned end;
};

// 範囲+レベル(木の深さ)
// レベルは出力を見やすくするためにのみ用いる
struct RangeWithLevel : public Range {
  RangeWithLevel(unsigned begin, unsigned end, unsigned level) : Range(begin,end), level(level) {}
  unsigned level;
}; 

// 同じレベルで、同一の文字が連接する範囲を取得する
//  - 同じレベルで連接する同じ文字 == トライ木上の同一ノード
//  - つまり、ノードの終端を取得する処理
//  - next_siblingとかいう名前でも良いかも
//
// rは、親ノードの始点と終端を格納している 
unsigned end_of_same_node(CharStreamVector& csv, const Range& r) {
  char beg_ch = csv[r.beg].read();  // ノード(の文字)を取得した後、子ノードに遷移  ※1
  for(unsigned cur=r.beg+1; cur < r.end; cur++)
    if(beg_ch != csv[cur].peek())   // 同一ノードか?
      return cur;                   //   別のノードが始まったので、ここで終了
    else
      csv[cur].read();              //   子ノードに遷移
  return r.end;
}
/* ※1 文字を一つ読み進める == 親ノードから子ノードにポインタを移す */

// レベル順探索を行う
void level_order_traverse(CharStreamVector& csv) {
  unsigned id=0;
  std::deque<RangeWithLevel> que;

  que.push_back(RangeWithLevel(0, csv.size(), 0)); // スーパールート
  while(!que.empty()) {
    RangeWithLevel r = que.front();
    que.pop_front();
    
    // 今回は、ヌル文字はノードとして扱わない
    if(csv[r.beg].eos())
      r.beg++;

    // 子ノードを探索する
    while(r.beg < r.end) {
      // 子ノード(の文字)をID付きで出力: IDは探索順にインクリメント
      for(unsigned i=0; i < r.level; i++) 
        std::cout << " ";  // レベル(深さ)分だけ空白を出力
      std::cout << id++ << "# " << csv[r.beg].peek() << std::endl;
     
      // 現在の子ノードの終端 == 次の子ノードの始点、を取得する
      unsigned end = end_of_same_node(csv, r);
      que.push_back(RangeWithLevel(r.beg, end, r.level+1));  // キューに追加
      r.beg = end;
    }
  }
}

// メイン関数
int main(int argc, char** argv) {
  const char* filepath = argv[1];
  CharStreamVector csv(filepath);
  level_order_traverse(csv);
  return 0;
}

実行例

$ g++ -o level_order level_order.cc
$ ./level_order sample.txt
0# a
1# 1
 2# b
 3# 2
 4# 3
  5# c
  6# e
  7# 4
  8# 6
  9# 5
   10# f
   11# 8
   12# 7
    13# 9