DoubleArray(3-1): TAIL配列圧縮

DoubleArrayの3-1。
今回はTAIL配列の圧縮を行う。

参考にした(かな?)のは、次の論文: 『ダブル配列におけるキャッシュの効率化』*1。

上の論文の中に「後方一致する接尾辞を併合することで、TAILを圧縮することができる」という記述があるが、今回はこの通りのことを行っている。

実装

;; 型定義
(deftype octet () '(unsigned-byte 8))

;; BASE配列のindexから、TAIL配列のindexを取得
(defun tail-index (base-index)
  (- @*base*#base-index))

;; 接尾文字列の長さを取得  ※ 終端文字(値)は、'1'
(defun tail-length (base-index &aux (index (tail-index base-index))) 
  (- (position 1 *tail* :start index) index))

;; TAIL配列へのポインタ(index)を保持するBASE配列のindexを集める
(defun collect-terminal-base-index ()
  (loop FOR i FROM 0 BELOW (length *base*)
        WHEN (minusp @*base*#i)  ; 負数なら、TAILへのポインタ(index)
        COLLECT i))

;; tail-from-end-=<とsharable?で共通して用いるループ
;; base-index1とbase-index2が指す接尾文字列を、それぞれ後ろ側から辿っている
(macrolet ((common-loop (&body body)
             `(let ((t1 (tail-index base-index1))
                    (t2 (tail-index base-index2)))
                (do ((i1 (1- (tail-length base-index1)) (1- i1))
                     (i2 (1- (tail-length base-index2)) (1- i2)))
                    ((minusp i1) t) 
                  ,@body))))

  ;; base-index1とbase-index2が指す接尾文字列を、末尾を起点に比較する
  (defun tail-from-end-=< (base-index1 base-index2)
    (common-loop
      (cond ((minusp i2) (return nil))
            ((< @*tail*#(+ t1 i1) @*tail*#(+ t2 i2)) (return t))
            ((> @*tail*#(+ t1 i1) @*tail*#(+ t2 i2)) (return nil)))))

  ;; base-index1(の接尾文字列)がbase-index2(の...)に包含されているか(共有可能か?)どうかを調べる
  (defun sharable? (base-index1 base-index2)
    (common-loop
      (when (or (minusp i2)
                (/= @*tail*#(+ t1 i1) @*tail*#(+ t2 i2)))
        (return nil)))))

;; 新しいTAIL配列に、(旧TAIL配列==*tail*の)tail-indexから始まる末尾文字列を追加する
(defun append-to-tail (new-tail tail-index)
  (do ((i tail-index (1+ i)))
      ((= 1 @*tail*#i) (vector-push-extend 1 new-tail))
    (vector-push-extend @*tail*#i new-tail)))


;; TAIL配列の圧縮を行う関数
;; 引数はdouble-array構造体
(defun shrink-tail (&optional (*da* *da*))
         ;; TAIL配列へのポインタ(index)を含むBASE配列のindexリスト
         ;; 併合処理を行いやすくするために、接尾文字列によって(逆順に)ソートしておく
  (let ((#1=base-indices 
         (sort (collect-terminal-base-index) (complement #'tail-from-end-=<)))

         ;; 新しいTAIL配列
        (new-tail (make-array 128 :fill-pointer 1 :adjustable t :element-type 'octet)))
    ;; base-indicesの要素を順番に処理
    (loop AS (bi share?) IN (cons `(,(first #1#) nil)
                                  (mapcar (lambda (cur prev) 
                                             `(,cur ,(sharable? cur prev)))
                                          (rest #1#) #1#)) DO
      (let ((fpos (fill-pointer new-tail)))
        (if share?
            ;; 共有可能なら、TAIL配列へのポインタを修正するだけにする
            (setf @*base*#bi (- (1+ (tail-length bi)) fpos)) ; 1+ は終端文字(値)用
          (progn 
            ;; 共有不可能なら、new-tailに末尾文字列を追加し、TAIL配列へのポインタも修正する
            (append-to-tail new-tail (tail-index bi))
            (setf @*base*#bi (- fpos))))))

    ;; TAIL配列更新
    (setf *tail* new-tail)
    'DONE))


;; 実行
> (shrink-tail *da*)
--> DONE

shrink-tail関数を適用することで、TAIL配列のサイズを半分くらいに圧縮できるようだ(もちろん、その時にTAIL配列の内容によって変わるので、あくまでも半分はおおまかな目安だが)。

*1:矢田晋，森田和宏，泓田正雄，平石亘，青江順一．ダブル配列におけるキャッシュの効率化． FIT2006．pp. 71-72．2006.