ユニコード文字列をバイトストリームとして扱うためのパッケージ

タイトル通りのパッケージ。
実装の前に使用例。

;;;; sbcl-1.0.49
;; 例で使用する文字列(および対応するバイト列)
(sb-ext:string-to-octets "下書き")
--> #(228 184 139 230 155 184 227 129 141)

;; 作成
(defparameter *in* (octet-stream:make "下書き"))
--> *in*

;; 一バイト先読みする
(octet-stream:peek *in*)
--> 228

;; 一バイト読み込む
(octet-stream:read *in*)
--> 228

(octet-stream:read *in*)
--> 184

(octet-stream:peek *in*)
--> 139

;; 一バイト読み捨てる
(octet-stream:eat *in*)
--> #S(OCTET-STREAM::OCTET-STREAM
        :SRC "下書き" :POS 1
        :END 3 :CODE 26360
        :OCTET-POS 3 :OCTET-LEN 3)

(octet-stream:peek *in*)
--> 230

;; 終端判定
(octet-stream:eos? *in*)
--> NIL

実装:

(defpackage octet-stream
  (:use :common-lisp)
  (:shadow :common-lisp read peek position)
  (:export make
           read
           peek
           eos?
           eat
           position))
(in-package :octet-stream)

;;;;;;;;;;;;;;;;
;;; declaration
(declaim (inline make-octet-stream make eos? octet-length peek read eat position))

;;;;;;;;;;;;;;;;;;;
;;; type definition
(deftype array-index () `(mod ,array-dimension-limit))
(deftype simple-characters () '(simple-array character))
(deftype unicode () `(mod ,char-code-limit))

;;;;;;;;;;;;;;;;
;;; octet-stream
(defstruct octet-stream
  (src      "" :type simple-characters)
  (pos       0 :type array-index)
  (end       0 :type array-index)
  (code      0 :type unicode)
  (octet-pos 0 :type (mod 5))
  (octet-len 0 :type (mod 5)))

;;;;;;;;;;;;;;;;;;;;;;
;;; auxiliary function
(defun octet-length (code)
  (declare (unicode code))
  (cond ((< code #x80)    1)
        ((< code #x800)   2)
        ((< code #x10000) 3)
        (t                4)))

;;;;;;;;;;;;;;;;;;;;;
;;; external function
(defun position (in)  ; 現在位置 (バイト単位ではなく文字単位)
  (octet-stream-pos in))

(defun make (string &key (start 0) (end (length string)))
  (declare (simple-characters string)
           (array-index start end))
  (let* ((code (if (= start (length string)) 
                  0
                (char-code (char string start))))
         (len (octet-length code)))
    (make-octet-stream :src string :pos start :end end
                       :code code :octet-pos len :octet-len len)))

(defun eos? (in)
  (with-slots (pos end) (the octet-stream in)
    (= pos end)))

;; 一バイト分先読みする。(eos? in)が真の時に呼び出された場合、返す値は未定義。
(defun peek (in)
  (with-slots (code octet-pos octet-len) (the octet-stream in)
    (if (= octet-pos octet-len)
        (case octet-len
            (1 code)
            (2 (+ #b11000000 (ldb (byte 5  6) code)))
            (3 (+ #b11100000 (ldb (byte 4 12) code)))
            (t (+ #b11110000 (ldb (byte 3 18) code))))
      (+ #b10000000 (ldb (byte 6 (* (the (mod 4) (1- octet-pos)) 6)) code)))))

(defun eat (in)
  (with-slots (src pos code octet-pos octet-len) (the octet-stream in)
    (decf octet-pos)
    (when (zerop octet-pos)
      (incf pos)
      (unless (eos? in)
        (setf code (char-code (char src pos))
              octet-len (octet-length code)
              octet-pos octet-len))))
  in)

(defun read (in)
  (prog1 (peek in)
    (eat in)))