効率的な正規表現エンジンを Coq で検証する

　今年はそれらしきアドベントカレンダーが無いので，この記事は ML アドベントカレンダー 2020 の4日目の記事と言い張ってみます． ML って本を正せば定理証明支援系由来ですし．最後にちょっと OCaml を使ってますし．

　Coq によって検証された正規表現エンジンは既に存在していて，探せば他にもいくつもあると思うのですが，それらの実装は主として検証の簡単さに重きが置かれており，OCaml のコードを extract して使ってみると実用に堪えないものが多いように思います．本記事では，以前 OCaml で書いた効率的な実装を Coq に移植して検証します．加えて，extract によって OCaml のコードを再び生成し， AtCoder の問題に投げてみても TLE にならないことを確認します．

　本記事で実装する正規表現エンジンと，その正当性の証明は GitHub に存在します． Coq のコードを参照しながら記事を読みたい方はそちらを参照してください．

正規表現エンジンの移植

　本記事で Coq に移植する正規表現エンジン¹は Brzozowski derivative に基づくものです．

　Brzozowski derivative $a^{-1} R$ とは，とある正規表現 $R$ が受理する言語 $L(R)$ の先頭から，文字 $a$ を取り除いて得られる言語 $\{u \in\Sigma^*| au\in L(R)\}$ を受理する正規表現です．本記事でサポートする正規表現の構文——一切の文字列を受理しない $\emptyset$ ，空文字列だけを受理する $\epsilon$ ，文字 $b$ だけからなる文字列を受理する $b$ ， $R$ と $S$ の両方が受理する文字列を受理する $R \cap S$ ， $R$ または $S$ のどれかが受理する文字列を受理する $R \cup S$ ， $R$ の受理する文字列に続けて $S$ の受理する文字列が続く文字列を受理する $R S$ ， $R$ を0個以上繋げる $R^*$ ——については，以下のように systematic に Brzozowski derivative を求められます．

${\displaystyle \begin{array}{l} a^{-1} \emptyset = \emptyset \\ a^{-1} \epsilon = \emptyset \\ a^{-1} b = \left\{ \begin{array}{l} \epsilon~(a=b) \\ \emptyset~(\mbox{others})\\ \end{array} \right. \\ a^{-1} (R \cap S) = (a^{-1} R) \cap (a^{-1} S) \\ a^{-1} (R \cup S) = (a^{-1} R) \cup (a^{-1} S) \\ a^{-1} (R S) = \left\{ \begin{array}{l} (a^{-1} R) S \cup (a^{-1} S) ~(\epsilon \in L(R)) \\ (a^{-1} R) S~(\mbox{others})\\ \end{array} \right. \\ a^{-1} (R^*) = (a^{-1} R) R^* \\ \end{array} }$

　ある正規表現が空文字列を受理するかどうかは簡単に判定できますから，マッチしているか調べたい正規表現を，文字列に含まれる文字によって繰り返し微分し，最後に空文字列を受理するかどうか調べることで正規表現エンジンを実装していました．

　もっとも，user-contrib にある実装のように，正規表現の構文木をヴァリアントとして定義して，空文字列を受理するかの判定や微分を再帰関数で定義するのは効率的ではありません．

まず，微分しても正規表現の構文は全てが変化してしまう訳ではなく，一部は元の正規表現と一致しています．にもかかわらず，空文字列を受理するかの判定をナイーブに再帰関数としてしまうと，構文木全体に対して再計算が行われてしまいます²．

加えて，一切の文字列を受理しない $\emptyset$ は微分しても $\emptyset$ のままなので，適宜簡単化しないと構文木のサイズが爆発してしまいます．

　そのため，以前の実装は以下の3要素からなるレコードで正規表現を定義していました．

その正規表現が $\emptyset$ または $\epsilon$ かどうか
空文字列を受理するかどうか
文字を受け取って微分した正規表現を返す関数

OCaml のコードにすると以下の通りです．

type sort = EmptySet | EmptyStr | Other
type t = { sort : sort; is_nullable : bool; derive : char -> t }

空文字列を受理するかのメモ化や枝刈りがしたいだけであれば別に構文木を明示的に扱わないようにする必要はないのですが，新しい構文の追加が容易になるので shallow embedding にしています．

　ここで，この定義を Coq に移植する際に注意点があります．OCaml による $\emptyset$ の定義を見てみましょう．

let rec empty_set =
  { sort = EmptySet;
    is_nullable = false;
    derive = fun _ -> empty_set }

$\emptyset$ は空文字列を受理せず，微分すると $\emptyset$ 自身になるのでそれぞれの要素は良いのですが，レコード empty_set の定義に empty_set 自身が表れています．

このような構文を定義すべく，正規表現は余帰納的な，ストリームのようなデータ構造として定義しなくてはなりません． Coq だと帰納的なデータ型と余帰納的なデータ型は明確に区別されていますから，CoInductive を用いる必要がありますね．

Inductive sort :=
  | Other
  | EmptySet
  | EmptyStr.

CoInductive t := regexp (sort : sort) (is_nullable : bool) (derive : character -> t) : t.

せっかくなので，この定義に従って empty_set を移植するとどうなるか見てみましょう．

CoFixpoint empty_set := regexp EmptySet false (fun _ => empty_set).

余帰納的なデータ構造を余再帰で作る形になるので，OCaml においては単なる let rec ですが， Fixpoint ではなく CoFixpoint を使う必要があります．

　もっとも， $\epsilon$ を特別扱いして簡約する動機は $\emptyset$ に比べて薄いですし， $\emptyset$ が空文字列を受理しないことと微分すると $\emptyset$ になることは明らかなので，実際の Coq での定義では $\emptyset$ だけを特別扱いして別のコンストラクタとして扱っています³．

CoInductive t : Set :=
  | empty_set
  | regexp (is_nullable : bool) (derive : character -> t).

　Fixpoint ではなく CoFixpoint を使うことにさえ気を付けていれば大体の構文は移植できるのですが， $R S$ と $R^*$ に関しては注意が必要です．ナイーブにそれらを Coq に移植すると以下のような定義になると思いますが，これらは Coq の検査に弾かれてしまいます．

CoFixpoint app re1 re2 :=
  match re1, re2 with
  | empty_set, _ => empty_set
  | _, empty_set => empty_set
  | regexp is_nullable1 derive1,
    regexp is_nullable2 derive2 =>
    regexp (is_nullable1 && is_nullable2) (fun c =>
      if is_nullable1
      then union (app (derive1 c) re2) (derive2 c)
      else app (derive1 c) re2)
  end.

CoFixpoint star re :=
  match re with
  | empty_set => empty_str
  | regexp is_nullable derive =>
    regexp true (fun c => app (derive c) (star c))
  end.

これはどういうことかと言うと，Coq は無矛盾性のために有限時間で値を返さない再帰を許さないことで有名ですが，同様に有限時間でコンストラクタを生成しないような余再帰的定義⁴をさせないために，余再帰呼び出しによる返り値をパターンマッチすることを禁じているのです． $R S$ の実装である app はもちろん， $R \cup S$ の実装である union も $\emptyset$ を簡約するために引数のパターンマッチを行っていますから，productivity を満たしてないぞってめっちゃ怒られる訳ですね．

　この問題を解決するには余再帰呼び出しの返り値を union とか app とかに渡さなければ良いので，少々強引ですが union と app をひとまとめにしたものや，app と star をひとまとめにしたものを余再帰的に定義することにします．

(* union (app re1 re2) re3 *)
CoFixpoint union_app re1 re2 re3 :=
  match re1, re2 with
  | empty_set, _ => re3
  | _, empty_set => re3
  | regexp is_nullable1 derive1, regexp is_nullable2 derive2 =>
    regexp (is_nullable1 && is_nullable2 || is_nullable re3) (fun c =>
      if is_nullable1
      then union_app (derive1 c) re2 (union (derive2 c) (derive re3 c))
      else union_app (derive1 c) re2 (derive re3 c))
  end.

(* app re1 (star re2) *)
CoFixpoint app_star re1 re2 :=
  match re1 with
  | empty_set => empty_set
  | regexp is_nullable1 derive1 =>
    regexp is_nullable1 (fun c =>
      if is_nullable1
      then app_star (union (derive1 c) (derive re2 c)) re2
      else app_star (derive1 c) re2)
  end.

実際に Coq にお伺いを立てれば分かるんですが，これらの定義は余再帰呼び出しの後でパターンマッチをしていないので怒られなくて，肝心の app と star は先ほど定義した余再帰的関数の特殊ケースとして実装できます．

Definition app re1 re2 := union_app re1 re2 empty_set.
Definition star := app_star empty_str.

　そういえば，正規表現の各種構文の話はしたのに，肝心の正規表現のマッチングの実装の話をしていませんでしたね．と言っても実装は本当に簡単で，文字列の中身を左から順番に見ながら正規表現を微分していって，最後に空文字列を受理するか確かめるだけです．

Fixpoint matches re s :=
  match re, s with
  | empty_set, _ => false
  | regexp is_nullable _, nil => is_nullable
  | regexp _ derive, c :: s => matches (derive c) s
  end.

正規表現エンジンの検証

　OCaml で実装していた正規表現エンジンを Coq に移植できたので，いよいよその正当性の証明に移ることができます．

　ここで，今回の正規表現エンジンは shallow embedding で実装しているので，deep embedding の時とは一風変わった検証方法になる点に注意が必要です．

プログラミング言語についての性質を Coq で検証したりしていると，ついつい構文を帰納的に定義して，その意味を帰納的な命題で定義したくなることでしょう．例えば「文字列が正規表現にマッチする」とかいう2項関係を帰納的に定義して，関数として定義した正規表現のマッチングが true を返すことが同値であることを証明する訳ですね．

ところが今回は shallow embedding なので，正規表現の構文ではなく，その正規表現が空文字列を受理するかどうかと微分結果だけが与えられます．よって「文字列が正規表現にマッチする」という2項関係を帰納的に定義していた頃には公理扱いだった，

「 $R$ にマッチして $S$ にもマッチする場合に限り $R \cap S$ にマッチする」

みたいな命題を今度は定理として証明していくことになります．具体的な証明は割と全部 straightforward なので GitHub の方を見てもらうことにして省略しますが，今回証明した正規表現エンジンの正当性のステートメントは以下の通りです．

Theorem empty_str_spec : forall s, matches empty_str s <-> s = nil.

Theorem char_spec c : forall s, matches (char c) s <-> s = cons c nil.

Theorem inter_spec : forall s re re',
  matches (inter re re') s = matches re s && matches re' s.

Theorem union_spec : forall s re re',
  matches (union re re') s = matches re s || matches re' s.

Lemma union_app_spec : forall s re1 re2 re3,
  matches (union_app re1 re2 re3) s <->
  ((exists s1 s2, s = s1 ++ s2 /\ matches re1 s1 /\ matches re2 s2) \/ matches re3 s).

Corollary app_spec s re1 re2 :
  matches (app re1 re2) s <->
  (exists s1 s2, s = s1 ++ s2 /\ matches re1 s1 /\ matches re2 s2).

Inductive matches_star re : list character -> Prop :=
  | matches_star_nil : matches_star re nil
  | matches_star_app s t :
      matches re s ->
      matches_star re t ->
      matches_star re (s ++ t).

Lemma app_star_spec : forall s re1 re2,
  matches (app_star re1 re2) s <->
  exists s1 s2, s = s1 ++ s2 /\ matches re1 s1 /\ matches_star re2 s2.

Corollary star_spec s re :
  matches (star re) s <-> matches_star re s.

Shallow embedding で実装したのもあって，定理が構文ごとに完全に独立していて面白いですね．新しい構文を追加したとしても，それ以外の構文について matcher が正しく動作することの証明は一切変更しなくても良くて大変拡張性が高いです．

正規表現エンジンの競プロでの使用

　さて，Coq で正規表現エンジンを検証する試みは数多く存在するにもかかわらず，本記事で題材として取り上げるに至ったのは，既存の実装から OCaml のコードを生成して実際に使ってみると現実的な速度で動かないことが原因でした．

本記事の実装がどの程度の速度で動作するのか確かめるには正規表現エンジンのベンチマークを使えば良いのでしょうが，正直僕は AtCoder 以外で OCaml を書かないので，Coq のコードから extract したコードを AtCoder の問題に投げてみて， TLE にならないことだけ確認して終わりにしたいと思います．

　それではまず，Coq による実装から OCaml のコードを生成しましょう．以下のように Vernacular コマンドを書いてやれば，OCaml に変換したコードを勝手に regexp.ml に書き出してくれます．

Extract Inductive bool => "bool" ["true" "false"].
Extract Inductive sumbool => "bool" ["true" "false"].
Extract Inductive list => "list" ["[]" "(::)"].
Extract Inlined Constant negb => "not".
Extract Inlined Constant andb => "(&&)".
Extract Inlined Constant orb => "(||)".
Extraction "regexp.ml" F.

生成された regexp.ml の中身は以下の通りです．

module type Char =
 sig
  type t

  val eq_dec : t -> t -> bool
 end

module F =
 functor (C:Char) ->
 struct
  type character = C.t

  type t = __t Lazy.t
  and __t =
  | Coq_empty_set
  | Coq_regexp of bool * (character -> t)

  (** val frob : t -> t **)

  let frob t0 =
    t0

  (** val matches : t -> character list -> bool **)

  let rec matches re s =
    match Lazy.force
    re with
    | Coq_empty_set -> false
    | Coq_regexp (is_nullable0, derive0) ->
      (match s with
       | [] -> is_nullable0
       | c::s0 -> matches (derive0 c) s0)

  (** val is_nullable : t -> bool **)

  let is_nullable re =
    match Lazy.force
    re with
    | Coq_empty_set -> false
    | Coq_regexp (is_nullable0, _) -> is_nullable0

  (** val derive : t -> character -> t **)

  let derive re =
    match Lazy.force
    re with
    | Coq_empty_set -> (fun _ -> lazy Coq_empty_set)
    | Coq_regexp (_, derive0) -> derive0

  (** val empty_str : t **)

  let empty_str =
    lazy (Coq_regexp (true, (fun _ -> lazy Coq_empty_set)))

  (** val char : C.t -> t **)

  let char c =
    lazy (Coq_regexp (false, (fun c' ->
      if C.eq_dec c c' then empty_str else lazy Coq_empty_set)))

  (** val inter : t -> t -> t **)

  let rec inter re re' =
    match Lazy.force
    re with
    | Coq_empty_set -> lazy Coq_empty_set
    | Coq_regexp (is_nullable0, derive0) ->
      (match Lazy.force
       re' with
       | Coq_empty_set -> lazy Coq_empty_set
       | Coq_regexp (is_nullable', derive') ->
         lazy (Coq_regexp (((&&) is_nullable0 is_nullable'), (fun c ->
           inter (derive0 c) (derive' c)))))

  (** val union : t -> t -> t **)

  let rec union re re' =
    match Lazy.force
    re with
    | Coq_empty_set -> re'
    | Coq_regexp (is_nullable0, derive0) ->
      (match Lazy.force
       re' with
       | Coq_empty_set -> re
       | Coq_regexp (is_nullable', derive') ->
         lazy (Coq_regexp (((||) is_nullable0 is_nullable'), (fun c ->
           union (derive0 c) (derive' c)))))

  (** val union_app : t -> t -> t -> t **)

  let rec union_app re1 re2 re3 =
    match Lazy.force
    re1 with
    | Coq_empty_set -> re3
    | Coq_regexp (is_nullable1, derive1) ->
      (match Lazy.force
       re2 with
       | Coq_empty_set -> re3
       | Coq_regexp (is_nullable2, derive2) ->
         lazy (Coq_regexp (((||) ((&&) is_nullable1 is_nullable2)
                             (is_nullable re3)), (fun c ->
           if is_nullable1
           then union_app (derive1 c) re2 (union (derive2 c) (derive re3 c))
           else union_app (derive1 c) re2 (derive re3 c)))))

  (** val app : t -> t -> t **)

  let app re1 re2 =
    union_app re1 re2 (lazy Coq_empty_set)

  (** val app_star : t -> t -> t **)

  let rec app_star re1 re2 =
    match Lazy.force
    re1 with
    | Coq_empty_set -> lazy Coq_empty_set
    | Coq_regexp (is_nullable1, derive1) ->
      lazy (Coq_regexp (is_nullable1, (fun c ->
        if is_nullable1
        then app_star (union (derive1 c) (derive re2 c)) re2
        else app_star (derive1 c) re2)))

  (** val star : t -> t **)

  let star =
    app_star empty_str
 end

どうせ関数でガードされてるんだから Lazy.t にする必要無いだろとか，思うところも無いわけでもないですが，概ね Coq の実装に沿ったコードが生成されています．

　それでは，この自動生成した OCaml のコードを使って AtCoder の問題を解いてみましょう．今回解いてみるのは，ABC049C - Daydreamという問題です．ここで問われているのは，単純に入力された文字列 $S$ が (dream|dreamer|erase|eraser)* にマッチするかだけなのですが⁵，入力 $S$ の長さは最大で $10^5$ 文字まで対応しないといけないので，計算量のオーダーが悪いと TLE になることでしょう．