weblib/tutorial/chapter7.ma

   1 (* <h1>Regular Expressions</h1>
   2 We shall apply all the previous machinery to the study of regular languages
   3 and the constructions of the associated finite automata. *)
   4
   5 include "tutorial/chapter6.ma".
   6
   7 (* The type re of regular expressions over an alphabet $S$ is the smallest
   8 collection of objects generated by the following constructors: *)
   9
  10 inductive re (S: \ 5a href="cic:/matita/tutorial/chapter4/DeqSet.ind(1,0,0)"\ 6DeqSet\ 5/a\ 6) : Type[0] ≝
  11    z: re S                (* empty: ∅ *)
  12  | e: re S                (* epsilon: ϵ *)
  13  | s: S → re S            (* symbol: a *)
  14  | c: re S → re S → re S  (* concatenation: e1 · e2 *)
  15  | o: re S → re S → re S  (* plus: e1 + e2 *)
  16  | k: re S → re S.        (* kleene's star: e* *)
  17
  18 interpretation "re epsilon" 'epsilon = (e ?).
  19 interpretation "re or" 'plus a b = (o ? a b).
  20 interpretation "re cat" 'middot a b = (c ? a b).
  21 interpretation "re star" 'star a = (k ? a).
  22
  23 notation < "a" non associative with precedence 90 for @{ 'ps $a}.
  24 notation > "` term 90 a" non associative with precedence 90 for @{ 'ps $a}.
  25 interpretation "atom" 'ps a = (s ? a).
  26
  27 notation "`∅" non associative with precedence 90 for @{ 'empty }.
  28 interpretation "empty" 'empty = (z ?).
  29
  30 (* The language sem{e} associated with the regular expression e is inductively
  31 defined by the following function: *)
  32
  33 let rec in_l (S : \ 5a href="cic:/matita/tutorial/chapter4/DeqSet.ind(1,0,0)"\ 6DeqSet\ 5/a\ 6\ 5span class="error" title="Parse error: RPAREN expected after [term] (in [arg])"\ 6\ 5/span\ 6) (r : \ 5a href="cic:/matita/tutorial/chapter7/re.ind(1,0,1)"\ 6re\ 5/a\ 6 S) on r : \ 5a href="cic:/matita/tutorial/chapter6/word.def(3)"\ 6word\ 5/a\ 6\ 5span class="error" title="Parse error: SYMBOL '≝' expected (in [let_defs])"\ 6\ 5/span\ 6 S → Prop ≝
  34 match r with
  35 [ z ⇒ \ 5a title="empty set" href="cic:/fakeuri.def(1)"\ 6∅\ 5/a\ 6\ 5span class="error" title="Parse error: SYMBOL '|' or SYMBOL ']' expected (in [term])"\ 6\ 5/span\ 6
  36 | e ⇒ \ 5a title="singleton" href="cic:/fakeuri.def(1)"\ 6{\ 5/a\ 6\ 5a title="epsilon" href="cic:/fakeuri.def(1)"\ 6ϵ\ 5/a\ 6}
  37 | s x ⇒ \ 5a title="singleton" href="cic:/fakeuri.def(1)"\ 6{\ 5/a\ 6\ 5span class="error" title="Parse error: [ident] or [term level 19] expected after [sym{] (in [term])"\ 6\ 5/span\ 6\ 5a title="cons" href="cic:/fakeuri.def(1)"\ 6[\ 5/a\ 6\ 5span class="error" title="Parse error: [term] expected after [sym[] (in [term])"\ 6\ 5/span\ 6x]}
  38 | c r1 r2 ⇒ (in_l ? r1) \ 5a title="cat lang" href="cic:/fakeuri.def(1)"\ 6·\ 5/a\ 6 (in_l ? r2)
  39 | o r1 r2 ⇒ (in_l ? r1) \ 5a title="union" href="cic:/fakeuri.def(1)"\ 6∪\ 5/a\ 6 (in_l ? r2)
  40 | k r1 ⇒ (in_l ? r1) \ 5a title="star lang" href="cic:/fakeuri.def(1)"\ 6^\ 5/a\ 6*].
  41
  42 notation "\sem{term 19 E}" non associative with precedence 75 for @{'in_l $E}.
  43 interpretation "in_l" 'in_l E = (in_l ? E).
  44 interpretation "in_l mem" 'mem w l = (in_l ? l w).
  45
  46 lemma rsem_star : ∀S.∀r: \ 5a href="cic:/matita/tutorial/chapter7/re.ind(1,0,1)"\ 6re\ 5/a\ 6 S. \ 5a title="in_l" href="cic:/fakeuri.def(1)"\ 6\sem\ 5/a\ 6{r\ 5a title="re star" href="cic:/fakeuri.def(1)"\ 6^\ 5/a\ 6*} \ 5a title="leibnitz's equality" href="cic:/fakeuri.def(1)"\ 6=\ 5/a\ 6 \ 5a title="in_l" href="cic:/fakeuri.def(1)"\ 6\sem\ 5/a\ 6{r}\ 5a title="star lang" href="cic:/fakeuri.def(1)"\ 6^\ 5/a\ 6*.
  47 // qed.
  48
  49
  50 (* <h2>Pointed Regular expressions </h2>
  51 We now introduce pointed regular expressions, that are the main tool we shall
  52 use for the construction of the automaton.
  53 A pointed regular expression is just a regular expression internally labelled
  54 with some additional points. Intuitively, points mark the positions inside the
  55 regular expression which have been reached after reading some prefix of
  56 the input string, or better the positions where the processing of the remaining
  57 string has to be started. Each pointed expression for $e$ represents a state of
  58 the {\em deterministic} automaton associated with $e$; since we obviously have
  59 only a finite number of possible labellings, the number of states of the automaton
  60 is finite.
  61
  62 Pointed regular expressions provide the tool for an algebraic revisitation of
  63 McNaughton and Yamada's algorithm for position automata, making the proof of its
  64 correctness, that is far from trivial, particularly clear and simple. In particular,
  65 pointed expressions offer an appealing alternative to Brzozowski's derivatives,
  66 avoiding their weakest point, namely the fact of being forced to quotient derivatives
  67 w.r.t. a suitable notion of equivalence in order to get a finite number of states
  68 (that is not essential for recognizing strings, but is crucial for comparing regular
  69 expressions).
  70
  71 Our main data structure is the notion of pointed item, that is meant whose purpose
  72 is to encode a set of positions inside a regular expression.
  73 The idea of formalizing pointers inside a data type by means of a labelled version
  74 of the data type itself is probably one of the first, major lessons learned in the
  75 formalization of the metatheory of programming languages. For our purposes, it is
  76 enough to mark positions preceding individual characters, so we shall have two kinds
  77 of characters •a (pp a) and a (ps a) according to the case a is pointed or not. *)
  78
  79 inductive pitem (S: \ 5a href="cic:/matita/tutorial/chapter4/DeqSet.ind(1,0,0)"\ 6DeqSet\ 5/a\ 6) : Type[0] ≝
  80    pz: pitem S                       (* empty *)
  81  | pe: pitem S                       (* epsilon *)
  82  | ps: S → pitem S                   (* symbol *)
  83  | pp: S → pitem S                   (* pointed sysmbol *)
  84  | pc: pitem S → pitem S → pitem S   (* concatenation *)
  85  | po: pitem S → pitem S → pitem S   (* plus *)
  86  | pk: pitem S → pitem S.            (* kleene's star *)
  87
  88 (* A pointed regular expression (pre) is just a pointed item with an additional
  89 boolean, that must be understood as the possibility to have a trailing point at
  90 the end of the expression. As we shall see, pointed regular expressions can be
  91 understood as states of a DFA, and the boolean indicates if
  92 the state is final or not. *)
  93
  94 definition pre ≝ λS.\ 5a href="cic:/matita/tutorial/chapter7/pitem.ind(1,0,1)"\ 6pitem\ 5/a\ 6 S \ 5a title="Product" href="cic:/fakeuri.def(1)"\ 6×\ 5/a\ 6 \ 5a href="cic:/matita/basics/bool/bool.ind(1,0,0)"\ 6bool\ 5/a\ 6.
  95
  96 interpretation "pitem star" 'star a = (pk ? a).
  97 interpretation "pitem or" 'plus a b = (po ? a b).
  98 interpretation "pitem cat" 'middot a b = (pc ? a b).
  99 notation < ".a" non associative with precedence 90 for @{ 'pp $a}.
 100 notation > "`. term 90 a" non associative with precedence 90 for @{ 'pp $a}.
 101 interpretation "pitem pp" 'pp a = (pp ? a).
 102 interpretation "pitem ps" 'ps a = (ps ? a).
 103 interpretation "pitem epsilon" 'epsilon = (pe ?).
 104 interpretation "pitem empty" 'empty = (pz ?).
 105
 106 (* The carrier $|i|$ of an item i is the regular expression obtained from i by
 107 removing all the points. Similarly, the carrier of a pointed regular expression
 108 is the carrier of its item. *)
 109
 110 let rec forget (S: DeqSet) (l : pitem S) on l: re S ≝
 111  match l with
 112   [ pz ⇒ `∅
 113   | pe ⇒ ϵ
 114   | ps x ⇒ `x
 115   | pp x ⇒ `x
 116   | pc E1 E2 ⇒ (forget ? E1) · (forget ? E2)
 117   | po E1 E2 ⇒ (forget ? E1) + (forget ? E2)
 118   | pk E ⇒ (forget ? E)^* ].
 119
 120 (* notation < "|term 19 e|" non associative with precedence 70 for @{'forget $e}.*)
 121 interpretation "forget" 'norm a = (forget ? a).
 122
 123 lemma erase_dot : ∀S.∀e1,e2:pitem S. |e1 · e2| = c ? (|e1|) (|e2|).
 124 // qed.
 125
 126 lemma erase_plus : ∀S.∀i1,i2:pitem S.
 127   |i1 + i2| = |i1| + |i2|.
 128 // qed.
 129
 130 lemma erase_star : ∀S.∀i:pitem S.|i^*| = |i|^*.
 131 // qed.
 132
 133 (* <h2>Comparing items and pres<h2>
 134 Items and pres are very concrete datatypes: they can be effectively compared,
 135 and enumerated. In particular, we can define a boolean equality beqitem and a proof
 136 beqitem_true that it refects propositional equality, enriching the set (pitem S)
 137 to a DeqSet. *)
 138
 139 let rec beqitem S (i1,i2: pitem S) on i1 ≝
 140   match i1 with
 141   [ pz ⇒ match i2 with [ pz ⇒ true | _ ⇒ false]
 142   | pe ⇒ match i2 with [ pe ⇒ true | _ ⇒ false]
 143   | ps y1 ⇒ match i2 with [ ps y2 ⇒ y1==y2 | _ ⇒ false]
 144   | pp y1 ⇒ match i2 with [ pp y2 ⇒ y1==y2 | _ ⇒ false]
 145   | po i11 i12 ⇒ match i2 with
 146     [ po i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
 147     | _ ⇒ false]
 148   | pc i11 i12 ⇒ match i2 with
 149     [ pc i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
 150     | _ ⇒ false]
 151   | pk i11 ⇒ match i2 with [ pk i21 ⇒ beqitem S i11 i21 | _ ⇒ false]
 152   ].
 153
 154 lemma beqitem_true: ∀S,i1,i2. iff (beqitem S i1 i2 = true) (i1 = i2).
 155 #S #i1 elim i1
 156   [#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
 157   |#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
 158   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
 159     [>(\P H) // | @(\b (refl …))]
 160   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
 161     [>(\P H) // | @(\b (refl …))]
 162   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
 163    normalize #H destruct
 164     [cases (true_or_false (beqitem S i11 i21)) #H1
 165       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
 166       |>H1 in H; normalize #abs @False_ind /2/
 167       ]
 168     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
 169     ]
 170   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
 171    normalize #H destruct
 172     [cases (true_or_false (beqitem S i11 i21)) #H1
 173       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
 174       |>H1 in H; normalize #abs @False_ind /2/
 175       ]
 176     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
 177     ]
 178   |#i3 #Hind #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i4] %
 179    normalize #H destruct
 180     [>(proj1 … (Hind i4) H) // |>(proj2 … (Hind i4) (refl …)) //]
 181   ]
 182 qed.
 183
 184 definition DeqItem ≝ λS.
 185   mk_DeqSet (pitem S) (beqitem S) (beqitem_true S).
 186
 187 (* We also add a couple of unification hints to allow the type inference system
 188 to look at (pitem S) as the carrier of a DeqSet, and at beqitem as if it was the
 189 equality function of a DeqSet. *)
 190
 191 unification hint  0 ≔ S;
 192     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
 193 (* ---------------------------------------- *) ⊢
 194     pitem S ≡ carr X.
 195
 196 unification hint  0 ≔ S,i1,i2;
 197     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
 198 (* ---------------------------------------- *) ⊢
 199     beqitem S i1 i2 ≡ eqb X i1 i2.
 200
 201 (* <h2>Semantics of pointed regular expression<h2>
 202 The intuitive semantic of a point is to mark the position where
 203 we should start reading the regular expression. The language associated
 204 to a pre is the union of the languages associated with its points. *)
 205
 206 let rec in_pl (S : DeqSet) (r : pitem S) on r : word S → Prop ≝
 207 match r with
 208 [ pz ⇒ ∅
 209 | pe ⇒ ∅
 210 | ps _ ⇒ ∅
 211 | pp x ⇒ { [x] }
 212 | pc r1 r2 ⇒ (in_pl ? r1) · \sem{forget ? r2} ∪ (in_pl ? r2)
 213 | po r1 r2 ⇒ (in_pl ? r1) ∪ (in_pl ? r2)
 214 | pk r1 ⇒ (in_pl ? r1) · \sem{forget ? r1}^*  ].
 215
 216 interpretation "in_pl" 'in_l E = (in_pl ? E).
 217 interpretation "in_pl mem" 'mem w l = (in_pl ? l w).
 218
 219 definition in_prl ≝ λS : DeqSet.λp:pre S.
 220   if (\snd p) then \sem{\fst p} ∪ {ϵ} else \sem{\fst p}.
 221
 222 interpretation "in_prl mem" 'mem w l = (in_prl ? l w).
 223 interpretation "in_prl" 'in_l E = (in_prl ? E).
 224
 225 (* The following, trivial lemmas are only meant for rewriting purposes. *)
 226
 227 lemma sem_pre_true : ∀S.∀i:pitem S.
 228   \sem{〈i,true〉} = \sem{i} ∪ {ϵ}.
 229 // qed.
 230
 231 lemma sem_pre_false : ∀S.∀i:pitem S.
 232   \sem{〈i,false〉} = \sem{i}.
 233 // qed.
 234
 235 lemma sem_cat: ∀S.∀i1,i2:pitem S.
 236   \sem{i1 · i2} = \sem{i1} · \sem{|i2|} ∪ \sem{i2}.
 237 // qed.
 238
 239 lemma sem_cat_w: ∀S.∀i1,i2:pitem S.∀w.
 240   \sem{i1 · i2} w = ((\sem{i1} · \sem{|i2|}) w ∨ \sem{i2} w).
 241 // qed.
 242
 243 lemma sem_plus: ∀S.∀i1,i2:pitem S.
 244   \sem{i1 + i2} = \sem{i1} ∪ \sem{i2}.
 245 // qed.
 246
 247 lemma sem_plus_w: ∀S.∀i1,i2:pitem S.∀w.
 248   \sem{i1 + i2} w = (\sem{i1} w ∨ \sem{i2} w).
 249 // qed.
 250
 251 lemma sem_star : ∀S.∀i:pitem S.
 252   \sem{i^*} = \sem{i} · \sem{|i|}^*.
 253 // qed.
 254
 255 lemma sem_star_w : ∀S.∀i:pitem S.∀w.
 256   \sem{i^*} w = (∃w1,w2.w1 @ w2 = w ∧ \sem{i} w1 ∧ \sem{|i|}^* w2).
 257 // qed.
 258
 259 (* Below are a few, simple, semantic properties of items. In particular:
 260 - not_epsilon_item : ∀S:DeqSet.∀i:pitem S. ¬ (\sem{i} ϵ).
 261 - epsilon_pre : ∀S.∀e:pre S. (\sem{i} ϵ) ↔ (\snd e = true).
 262 - minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}.
 263 - minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}.
 264 The first property is proved by a simple induction on $i$; the other
 265 results are easy corollaries. We need an auxiliary lemma first. *)
 266
 267 lemma append_eq_nil : ∀S.∀w1,w2:word S. w1 @ w2 = ϵ → w1 = ϵ.
 268 #S #w1 #w2 cases w1 // #a #tl normalize #H destruct qed.
 269
 270 lemma not_epsilon_lp : ∀S:DeqSet.∀e:pitem S. ¬ (ϵ ∈ e).
 271 #S #e elim e normalize /2/
 272   [#r1 #r2 * #n1 #n2 % * /2/ * #w1 * #w2 * * #H
 273    >(append_eq_nil …H…) /2/
 274   |#r1 #r2 #n1 #n2 % * /2/
 275   |#r #n % * #w1 * #w2 * * #H >(append_eq_nil …H…) /2/
 276   ]
 277 qed.
 278
 279 lemma epsilon_to_true : ∀S.∀e:pre S. ϵ ∈ e → \snd e = true.
 280 #S * #i #b cases b // normalize #H @False_ind /2/
 281 qed.
 282
 283 lemma true_to_epsilon : ∀S.∀e:pre S. \snd e = true → ϵ ∈ e.
 284 #S * #i #b #btrue normalize in btrue; >btrue %2 //
 285 qed.
 286
 287 lemma minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}.
 288 #S #i #w %
 289   [#H whd % // normalize @(not_to_not … (not_epsilon_lp …i)) //
 290   |* //
 291   ]
 292 qed.
 293
 294 lemma minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}.
 295 #S * #i *
 296   [>sem_pre_true normalize in ⊢ (??%?); #w %
 297     [/3/ | * * // #H1 #H2 @False_ind @(absurd …H1 H2)]
 298   |>sem_pre_false normalize in ⊢ (??%?); #w % [ /3/ | * // ]
 299   ]
 300 qed.
 301