tutorial/chapter8.ma

   1 (*
   2 Regular Expressions
   3
   4 We shall apply all the previous machinery to the study of regular languages
   5 and the constructions of the associated finite automata. *)
   6
   7 include "tutorial/chapter7.ma".
   8 include "basics/core_notation/card_1.ma".
   9
  10 (* The type re of regular expressions over an alphabet $S$ is the smallest
  11 collection of objects generated by the following constructors: *)
  12
  13 inductive re (S: DeqSet) : Type[0] ≝
  14    z: re S                (* empty: ∅ *)
  15  | e: re S                (* epsilon: ϵ *)
  16  | s: S → re S            (* symbol: a *)
  17  | c: re S → re S → re S  (* concatenation: e1 · e2 *)
  18  | o: re S → re S → re S  (* plus: e1 + e2 *)
  19  | k: re S → re S.        (* kleene's star: e* *)
  20
  21 interpretation "re epsilon" 'epsilon = (e ?).
  22 interpretation "re or" 'plus a b = (o ? a b).
  23 interpretation "re cat" 'middot a b = (c ? a b).
  24 interpretation "re star" 'star a = (k ? a).
  25
  26 notation < "a" non associative with precedence 90 for @{ 'ps $a}.
  27 notation > "` term 90 a" non associative with precedence 90 for @{ 'ps $a}.
  28 interpretation "atom" 'ps a = (s ? a).
  29
  30 notation "`∅" non associative with precedence 90 for @{ 'empty }.
  31 interpretation "empty" 'empty = (z ?).
  32
  33 (* The language sem{e} associated with the regular expression e is inductively
  34 defined by the following function: *)
  35
  36 let rec in_l (S : DeqSet) (r : re S) on r : word S → Prop ≝
  37 match r with
  38 [ z ⇒ ∅
  39 | e ⇒ {ϵ}
  40 | s x ⇒ { (x::[]) }
  41 | c r1 r2 ⇒ (in_l ? r1) · (in_l ? r2)
  42 | o r1 r2 ⇒ (in_l ? r1) ∪ (in_l ? r2)
  43 | k r1 ⇒ (in_l ? r1) ^*].
  44
  45
  46 notation "\sem{term 19 E}" non associative with precedence 75 for @{'in_l $E}.
  47 interpretation "in_l" 'in_l E = (in_l ? E).
  48 interpretation "in_l mem" 'mem w l = (in_l ? l w).
  49
  50 lemma rsem_star : ∀S.∀r: re S. \sem{r^*} = \sem{r}^*.
  51 // qed.
  52
  53
  54 (*
  55 Pointed Regular expressions
  56
  57 We now introduce pointed regular expressions, that are the main tool we shall
  58 use for the construction of the automaton.
  59 A pointed regular expression is just a regular expression internally labelled
  60 with some additional points. Intuitively, points mark the positions inside the
  61 regular expression which have been reached after reading some prefix of
  62 the input string, or better the positions where the processing of the remaining
  63 string has to be started. Each pointed expression for $e$ represents a state of
  64 the {\em deterministic} automaton associated with $e$; since we obviously have
  65 only a finite number of possible labellings, the number of states of the automaton
  66 is finite.
  67
  68 Pointed regular expressions provide the tool for an algebraic revisitation of
  69 McNaughton and Yamada's algorithm for position automata, making the proof of its
  70 correctness, that is far from trivial, particularly clear and simple. In particular,
  71 pointed expressions offer an appealing alternative to Brzozowski's derivatives,
  72 avoiding their weakest point, namely the fact of being forced to quotient derivatives
  73 w.r.t. a suitable notion of equivalence in order to get a finite number of states
  74 (that is not essential for recognizing strings, but is crucial for comparing regular
  75 expressions).
  76
  77 Our main data structure is the notion of pointed item, that is meant whose purpose
  78 is to encode a set of positions inside a regular expression.
  79 The idea of formalizing pointers inside a data type by means of a labelled version
  80 of the data type itself is probably one of the first, major lessons learned in the
  81 formalization of the metatheory of programming languages. For our purposes, it is
  82 enough to mark positions preceding individual characters, so we shall have two kinds
  83 of characters •a (pp a) and a (ps a) according to the case a is pointed or not. *)
  84
  85 inductive pitem (S: DeqSet) : Type[0] ≝
  86    pz: pitem S                       (* empty *)
  87  | pe: pitem S                       (* epsilon *)
  88  | ps: S → pitem S                   (* symbol *)
  89  | pp: S → pitem S                   (* pointed sysmbol *)
  90  | pc: pitem S → pitem S → pitem S   (* concatenation *)
  91  | po: pitem S → pitem S → pitem S   (* plus *)
  92  | pk: pitem S → pitem S.            (* kleene's star *)
  93
  94 (* A pointed regular expression (pre) is just a pointed item with an additional
  95 boolean, that must be understood as the possibility to have a trailing point at
  96 the end of the expression. As we shall see, pointed regular expressions can be
  97 understood as states of a DFA, and the boolean indicates if
  98 the state is final or not. *)
  99
 100 definition pre ≝ λS.pitem S × bool.
 101
 102 interpretation "pitem star" 'star a = (pk ? a).
 103 interpretation "pitem or" 'plus a b = (po ? a b).
 104 interpretation "pitem cat" 'middot a b = (pc ? a b).
 105 notation < ".a" non associative with precedence 90 for @{ 'pp $a}.
 106 notation > "`. term 90 a" non associative with precedence 90 for @{ 'pp $a}.
 107 interpretation "pitem pp" 'pp a = (pp ? a).
 108 interpretation "pitem ps" 'ps a = (ps ? a).
 109 interpretation "pitem epsilon" 'epsilon = (pe ?).
 110 interpretation "pitem empty" 'empty = (pz ?).
 111
 112 (* The carrier $|i|$ of an item i is the regular expression obtained from i by
 113 removing all the points. Similarly, the carrier of a pointed regular expression
 114 is the carrier of its item. *)
 115
 116 let rec forget (S: DeqSet) (l : pitem S) on l: re S ≝
 117  match l with
 118   [ pz ⇒ z ? (* `∅ *)
 119   | pe ⇒ ϵ
 120   | ps x ⇒ `x
 121   | pp x ⇒ `x
 122   | pc E1 E2 ⇒ (forget ? E1) · (forget ? E2)
 123   | po E1 E2 ⇒ (forget ? E1) + (forget ? E2)
 124   | pk E ⇒ (forget ? E)^* ].
 125
 126 (* Already in the library
 127 notation "| term 19 C |" with precedence 70 for @{ 'card $C }.
 128 *)
 129 interpretation "forget" 'card a = (forget ? a).
 130
 131 lemma erase_dot : ∀S.∀e1,e2:pitem S. |e1 · e2| = c ? (|e1|) (|e2|).
 132 // qed.
 133
 134 lemma erase_plus : ∀S.∀i1,i2:pitem S.
 135   |i1 + i2| = |i1| + |i2|.
 136 // qed.
 137
 138 lemma erase_star : ∀S.∀i:pitem S.|i^*| = |i|^*.
 139 // qed.
 140
 141 (*
 142 Comparing items and pres
 143
 144 Items and pres are very concrete datatypes: they can be effectively compared,
 145 and enumerated. In particular, we can define a boolean equality beqitem and a proof
 146 beqitem_true that it refects propositional equality, enriching the set (pitem S)
 147 to a DeqSet. *)
 148
 149 let rec beqitem S (i1,i2: pitem S) on i1 ≝
 150   match i1 with
 151   [ pz ⇒ match i2 with [ pz ⇒ true | _ ⇒ false]
 152   | pe ⇒ match i2 with [ pe ⇒ true | _ ⇒ false]
 153   | ps y1 ⇒ match i2 with [ ps y2 ⇒ y1==y2 | _ ⇒ false]
 154   | pp y1 ⇒ match i2 with [ pp y2 ⇒ y1==y2 | _ ⇒ false]
 155   | po i11 i12 ⇒ match i2 with
 156     [ po i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
 157     | _ ⇒ false]
 158   | pc i11 i12 ⇒ match i2 with
 159     [ pc i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
 160     | _ ⇒ false]
 161   | pk i11 ⇒ match i2 with [ pk i21 ⇒ beqitem S i11 i21 | _ ⇒ false]
 162   ].
 163
 164 lemma beqitem_true: ∀S,i1,i2. iff (beqitem S i1 i2 = true) (i1 = i2).
 165 #S #i1 elim i1
 166   [#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
 167   |#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
 168   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
 169     [>(\P H) // | @(\b (refl …))]
 170   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
 171     [>(\P H) // | @(\b (refl …))]
 172   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
 173    normalize #H destruct
 174     [cases (true_or_false (beqitem S i11 i21)) #H1
 175       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
 176       |>H1 in H; normalize #abs @False_ind /2/
 177       ]
 178     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
 179     ]
 180   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
 181    normalize #H destruct
 182     [cases (true_or_false (beqitem S i11 i21)) #H1
 183       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
 184       |>H1 in H; normalize #abs @False_ind /2/
 185       ]
 186     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
 187     ]
 188   |#i3 #Hind #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i4] %
 189    normalize #H destruct
 190     [>(proj1 … (Hind i4) H) // |>(proj2 … (Hind i4) (refl …)) //]
 191   ]
 192 qed.
 193
 194 definition DeqItem ≝ λS.
 195   mk_DeqSet (pitem S) (beqitem S) (beqitem_true S).
 196
 197 (* We also add a couple of unification hints to allow the type inference system
 198 to look at (pitem S) as the carrier of a DeqSet, and at beqitem as if it was the
 199 equality function of a DeqSet. *)
 200
 201 unification hint  0 ≔ S;
 202     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
 203 (* ---------------------------------------- *) ⊢
 204     pitem S ≡ carr X.
 205
 206 unification hint  0 ≔ S,i1,i2;
 207     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
 208 (* ---------------------------------------- *) ⊢
 209     beqitem S i1 i2 ≡ eqb X i1 i2.
 210
 211 (*
 212 Semantics of pointed regular expressions
 213
 214 The intuitive semantic of a point is to mark the position where
 215 we should start reading the regular expression. The language associated
 216 to a pre is the union of the languages associated with its points. *)
 217
 218 let rec in_pl (S : DeqSet) (r : pitem S) on r : word S → Prop ≝
 219 match r with
 220 [ pz ⇒ ∅
 221 | pe ⇒ ∅
 222 | ps _ ⇒ ∅
 223 | pp x ⇒ { (x::[]) }
 224 | pc r1 r2 ⇒ (in_pl ? r1) · \sem{forget ? r2} ∪ (in_pl ? r2)
 225 | po r1 r2 ⇒ (in_pl ? r1) ∪ (in_pl ? r2)
 226 | pk r1 ⇒ (in_pl ? r1) · \sem{forget ? r1}^*  ].
 227
 228 interpretation "in_pl" 'in_l E = (in_pl ? E).
 229 interpretation "in_pl mem" 'mem w l = (in_pl ? l w).
 230
 231 definition in_prl ≝ λS : DeqSet.λp:pre S.
 232   if (snd ?? p) then \sem{fst ?? p} ∪ {ϵ} else \sem{fst ?? p}.
 233
 234 interpretation "in_prl mem" 'mem w l = (in_prl ? l w).
 235 interpretation "in_prl" 'in_l E = (in_prl ? E).
 236
 237 (* The following, trivial lemmas are only meant for rewriting purposes. *)
 238
 239 lemma sem_pre_true : ∀S.∀i:pitem S.
 240   \sem{〈i,true〉} = \sem{i} ∪ {ϵ}.
 241 // qed.
 242
 243 lemma sem_pre_false : ∀S.∀i:pitem S.
 244   \sem{〈i,false〉} = \sem{i}.
 245 // qed.
 246
 247 lemma sem_cat: ∀S.∀i1,i2:pitem S.
 248   \sem{i1 · i2} = \sem{i1} · \sem{|i2|} ∪ \sem{i2}.
 249 // qed.
 250
 251 lemma sem_cat_w: ∀S.∀i1,i2:pitem S.∀w.
 252   \sem{i1 · i2} w = ((\sem{i1} · \sem{|i2|}) w ∨ \sem{i2} w).
 253 // qed.
 254
 255 lemma sem_plus: ∀S.∀i1,i2:pitem S.
 256   \sem{i1 + i2} = \sem{i1} ∪ \sem{i2}.
 257 // qed.
 258
 259 lemma sem_plus_w: ∀S.∀i1,i2:pitem S.∀w.
 260   \sem{i1 + i2} w = (\sem{i1} w ∨ \sem{i2} w).
 261 // qed.
 262
 263 lemma sem_star : ∀S.∀i:pitem S.
 264   \sem{i^*} = \sem{i} · \sem{|i|}^*.
 265 // qed.
 266
 267 lemma sem_star_w : ∀S.∀i:pitem S.∀w.
 268   \sem{i^*} w = (∃w1,w2.w1 @ w2 = w ∧ \sem{i} w1 ∧ \sem{|i|}^* w2).
 269 // qed.
 270
 271 (* Below are a few, simple, semantic properties of items. In particular:
 272 - not_epsilon_item : ∀S:DeqSet.∀i:pitem S. ¬ (\sem{i} ϵ).
 273 - epsilon_pre : ∀S.∀e:pre S. (\sem{i} ϵ) ↔ (\snd e = true).
 274 - minus_eps_item: ∀S.∀i:pitem S. \sem{i} ≐ \sem{i}-{[ ]}.
 275 - minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} ≐ \sem{e}-{[ ]}.
 276 The first property is proved by a simple induction on $i$; the other
 277 results are easy corollaries. We need an auxiliary lemma first. *)
 278
 279 lemma append_eq_nil : ∀S.∀w1,w2:word S. w1 @ w2 = ϵ → w1 = ϵ.
 280 #S #w1 #w2 cases w1 // #a #tl normalize #H destruct qed.
 281
 282 lemma not_epsilon_lp : ∀S:DeqSet.∀e:pitem S. ¬ (ϵ ∈ e).
 283 #S #e elim e normalize
 284   [1,2,3:/2/
 285   |#x % #H destruct
 286   |#r1 #r2 * #n1 #n2 % * /2/ * #w1 * #w2 * * #H
 287    >(append_eq_nil …H…) /2/
 288   |#r1 #r2 #n1 #n2 % * /2/
 289   |#r #n % * #w1 * #w2 * * #H >(append_eq_nil …H…) /2/
 290   ]
 291 qed.
 292
 293 lemma epsilon_to_true : ∀S.∀e:pre S. ϵ ∈ e → snd … e = true.
 294 #S * #i #b cases b // normalize #H @False_ind /2/
 295 qed.
 296
 297 lemma true_to_epsilon : ∀S.∀e:pre S. snd … e = true → ϵ ∈ e.
 298 #S * #i #b #btrue normalize in btrue; >btrue %2 //
 299 qed.
 300
 301 lemma minus_eps_item: ∀S.∀i:pitem S. \sem{i} ≐ \sem{i}-{ϵ}.
 302 #S #i #w %
 303   [#H whd % // normalize @(not_to_not … (not_epsilon_lp …i)) //
 304   |* //
 305   ]
 306 qed.
 307
 308 lemma minus_eps_pre: ∀S.∀e:pre S. \sem{fst ?? e} ≐ \sem{e}-{ϵ}.
 309 #S * #i *
 310   [>sem_pre_true normalize in ⊢ (??%?); #w %
 311     [/3/ | * * // #H1 #H2 @False_ind @(absurd …H1 H2)]
 312   |>sem_pre_false normalize in ⊢ (??%?); #w % [ /3/ | * // ]
 313   ]
 314 qed.