matita/matita/lib/tutorial/chapter7.ma

   1 (*
   2 Regular Expressions
   3
   4 We shall apply all the previous machinery to the study of regular languages
   5 and the constructions of the associated finite automata. *)
   6
   7 include "tutorial/chapter6.ma".
   8
   9 (* The type re of regular expressions over an alphabet $S$ is the smallest
  10 collection of objects generated by the following constructors: *)
  11
   12 inductive re (S: DeqSet) : Type[0] ≝ (* regular expressions over the alphabet S *)
   13    z: re S                (* empty language: ∅ *)
   14  | e: re S                (* empty word: ϵ *)
   15  | s: S → re S            (* single symbol: a *)
   16  | c: re S → re S → re S  (* concatenation: e1 · e2 *)
   17  | o: re S → re S → re S  (* plus (union): e1 + e2 *)
   18  | k: re S → re S.        (* Kleene's star: e* *)
  19
   20 interpretation "re epsilon" 'epsilon = (e ?). (* on re, 'epsilon stands for constructor e *)
   21 interpretation "re or" 'plus a b = (o ? a b). (* on re, 'plus stands for constructor o *)
   22 interpretation "re cat" 'middot a b = (c ? a b). (* on re, 'middot stands for constructor c *)
   23 interpretation "re star" 'star a = (k ? a). (* on re, 'star stands for constructor k *)
   24
   25 notation < "a" non associative with precedence 90 for @{ 'ps $a}. (* output direction: an atom is printed as the bare symbol *)
   26 notation > "` term 90 a" non associative with precedence 90 for @{ 'ps $a}. (* input direction: an atom is typed as a backquote followed by the symbol *)
   27 interpretation "atom" 'ps a = (s ? a). (* on re, 'ps stands for constructor s *)
   28
   29 notation "`∅" non associative with precedence 90 for @{ 'empty }. (* concrete syntax for 'empty *)
   30 interpretation "empty" 'empty = (z ?). (* on re, 'empty stands for constructor z *)
  31
  32 (* The language sem{e} associated with the regular expression e is inductively
  33 defined by the following function: *)
  34
   35 let rec in_l (S : DeqSet) (r : re S) on r : word S → Prop ≝ (* language of r as a predicate on words, by structural recursion on r *)
   36 match r with
   37 [ z ⇒ ∅ (* no word *)
   38 | e ⇒ {ϵ} (* exactly the empty word *)
   39 | s x ⇒ { (x::[]) } (* exactly the one-symbol word x *)
   40 | c r1 r2 ⇒ (in_l ? r1) · (in_l ? r2) (* the · operator applied to the two sub-languages *)
   41 | o r1 r2 ⇒ (in_l ? r1) ∪ (in_l ? r2) (* union of the two sub-languages *)
   42 | k r1 ⇒ (in_l ? r1) ^*]. (* the star operator applied to the sub-language *)
   43
   44 notation "\sem{term 19 E}" non associative with precedence 75 for @{'in_l $E}. (* concrete syntax for 'in_l *)
   45 interpretation "in_l" 'in_l E = (in_l ? E). (* on a regular expression, 'in_l E means in_l E *)
   46 interpretation "in_l mem" 'mem w l = (in_l ? l w). (* on a regular expression l, 'mem w l means in_l l w *)
  47
   48 lemma rsem_star : ∀S.∀r: re S. \sem{r^*} = \sem{r}^*. (* rewriting form of the k branch of in_l *)
   49 // qed. (* closed by automation alone *)
  50
  51
  52 (*
  53 Pointed Regular expressions
  54
  55 We now introduce pointed regular expressions, that are the main tool we shall
  56 use for the construction of the automaton.
  57 A pointed regular expression is just a regular expression internally labelled
  58 with some additional points. Intuitively, points mark the positions inside the
  59 regular expression which have been reached after reading some prefix of
  60 the input string, or better the positions where the processing of the remaining
  61 string has to be started. Each pointed expression for $e$ represents a state of
  62 the {\em deterministic} automaton associated with $e$; since we obviously have
  63 only a finite number of possible labellings, the number of states of the automaton
  64 is finite.
  65
  66 Pointed regular expressions provide the tool for an algebraic revisitation of
  67 McNaughton and Yamada's algorithm for position automata, making the proof of its
  68 correctness, that is far from trivial, particularly clear and simple. In particular,
  69 pointed expressions offer an appealing alternative to Brzozowski's derivatives,
  70 avoiding their weakest point, namely the fact of being forced to quotient derivatives
  71 w.r.t. a suitable notion of equivalence in order to get a finite number of states
  72 (that is not essential for recognizing strings, but is crucial for comparing regular
  73 expressions).
  74
   75 Our main data structure is the notion of pointed item, whose purpose
   76 is to encode a set of positions inside a regular expression.
  77 The idea of formalizing pointers inside a data type by means of a labelled version
  78 of the data type itself is probably one of the first, major lessons learned in the
  79 formalization of the metatheory of programming languages. For our purposes, it is
  80 enough to mark positions preceding individual characters, so we shall have two kinds
   81 of characters, •a (pp a) and a (ps a), according to whether a is pointed or not. *)
  82
   83 inductive pitem (S: DeqSet) : Type[0] ≝ (* items: regular expressions over S whose symbols may carry a point *)
   84    pz: pitem S                       (* empty *)
   85  | pe: pitem S                       (* epsilon *)
   86  | ps: S → pitem S                   (* symbol *)
   87  | pp: S → pitem S                   (* pointed symbol *)
   88  | pc: pitem S → pitem S → pitem S   (* concatenation *)
   89  | po: pitem S → pitem S → pitem S   (* plus *)
   90  | pk: pitem S → pitem S.            (* Kleene's star *)
  91
  92 (* A pointed regular expression (pre) is just a pointed item with an additional
  93 boolean, that must be understood as the possibility to have a trailing point at
  94 the end of the expression. As we shall see, pointed regular expressions can be
  95 understood as states of a DFA, and the boolean indicates if
  96 the state is final or not. *)
  97
   98 definition pre ≝ λS.pitem S × bool. (* a pair made of an item and a boolean *)
  99
  100 interpretation "pitem star" 'star a = (pk ? a). (* on items, 'star stands for constructor pk *)
  101 interpretation "pitem or" 'plus a b = (po ? a b). (* on items, 'plus stands for constructor po *)
  102 interpretation "pitem cat" 'middot a b = (pc ? a b). (* on items, 'middot stands for constructor pc *)
  103 notation < ".a" non associative with precedence 90 for @{ 'pp $a}. (* output direction: a pointed symbol is printed with a leading dot *)
  104 notation > "`. term 90 a" non associative with precedence 90 for @{ 'pp $a}. (* input direction: a pointed symbol is typed as backquote, dot, symbol *)
  105 interpretation "pitem pp" 'pp a = (pp ? a). (* 'pp stands for constructor pp *)
  106 interpretation "pitem ps" 'ps a = (ps ? a). (* on items, 'ps stands for constructor ps *)
  107 interpretation "pitem epsilon" 'epsilon = (pe ?). (* on items, 'epsilon stands for constructor pe *)
  108 interpretation "pitem empty" 'empty = (pz ?). (* on items, 'empty stands for constructor pz *)
 109
 110 (* The carrier $|i|$ of an item i is the regular expression obtained from i by
 111 removing all the points. Similarly, the carrier of a pointed regular expression
 112 is the carrier of its item. *)
 113
  114 let rec forget (S: DeqSet) (l : pitem S) on l: re S ≝ (* carrier of l: the regular expression obtained by dropping every point *)
  115  match l with
  116   [ pz ⇒ z ? (* `∅ *)
  117   | pe ⇒ ϵ
  118   | ps x ⇒ `x
  119   | pp x ⇒ `x (* a pointed symbol and a plain symbol have the same image *)
  120   | pc E1 E2 ⇒ (forget ? E1) · (forget ? E2)
  121   | po E1 E2 ⇒ (forget ? E1) + (forget ? E2)
  122   | pk E ⇒ (forget ? E)^* ]. (* composite items are translated componentwise *)
  123
  124 interpretation "forget" 'card a = (forget ? a). (* the 'card notation, written |a|, means forget a *)
 125
  126 lemma erase_dot : ∀S.∀e1,e2:pitem S. |e1 · e2| = c ? (|e1|) (|e2|). (* rewriting form of the pc branch of forget *)
  127 // qed. (* closed by automation alone *)
 128
  129 lemma erase_plus : ∀S.∀i1,i2:pitem S. (* rewriting form of the po branch of forget *)
  130   |i1 + i2| = |i1| + |i2|.
  131 // qed. (* closed by automation alone *)
 132
  133 lemma erase_star : ∀S.∀i:pitem S.|i^*| = |i|^*. (* rewriting form of the pk branch of forget *)
  134 // qed. (* closed by automation alone *)
 135
 136 (*
 137 Comparing items and pres
 138
 139 Items and pres are very concrete datatypes: they can be effectively compared,
 140 and enumerated. In particular, we can define a boolean equality beqitem and a proof
  141 beqitem_true that it reflects propositional equality, enriching the set (pitem S)
 142 to a DeqSet. *)
 143
  144 let rec beqitem S (i1,i2: pitem S) on i1 ≝ (* boolean equality test on items, by structural recursion on i1 *)
  145   match i1 with
  146   [ pz ⇒ match i2 with [ pz ⇒ true | _ ⇒ false]
  147   | pe ⇒ match i2 with [ pe ⇒ true | _ ⇒ false]
  148   | ps y1 ⇒ match i2 with [ ps y2 ⇒ y1==y2 | _ ⇒ false] (* symbols are compared with == *)
  149   | pp y1 ⇒ match i2 with [ pp y2 ⇒ y1==y2 | _ ⇒ false]
  150   | po i11 i12 ⇒ match i2 with
  151     [ po i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22 (* both pairs of components must test equal *)
  152     | _ ⇒ false]
  153   | pc i11 i12 ⇒ match i2 with
  154     [ pc i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
  155     | _ ⇒ false]
  156   | pk i11 ⇒ match i2 with [ pk i21 ⇒ beqitem S i11 i21 | _ ⇒ false]
  157   ]. (* items with different head constructors always yield false *)
 158
  159 lemma beqitem_true: ∀S,i1,i2. iff (beqitem S i1 i2 = true) (i1 = i2). (* beqitem yields true exactly when the two items are equal *)
  160 #S #i1 elim i1 (* induction on i1; every case then splits on the shape of i2 and proves both directions *)
  161   [#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
  162   |#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct (* first two cases: constructors without arguments *)
  163   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
  164     [>(\P H) // | @(\b (refl …))] (* symbol case: presumably \P and \b convert between == and = on S - confirm in the DeqSet chapter *)
  165   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
  166     [>(\P H) // | @(\b (refl …))]
  167   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % (* binary constructor: two induction hypotheses *)
  168    normalize #H destruct
  169     [cases (true_or_false (beqitem S i11 i21)) #H1 (* split on the result of testing the first components *)
  170       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
  171       |>H1 in H; normalize #abs @False_ind /2/ (* first test false: the hypothesis becomes contradictory *)
  172       ]
  173     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
  174     ]
  175   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % (* second binary constructor: same script as the previous case *)
  176    normalize #H destruct
  177     [cases (true_or_false (beqitem S i11 i21)) #H1
  178       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
  179       |>H1 in H; normalize #abs @False_ind /2/
  180       ]
  181     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
  182     ]
  183   |#i3 #Hind #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i4] % (* unary constructor: one induction hypothesis *)
  184    normalize #H destruct
  185     [>(proj1 … (Hind i4) H) // |>(proj2 … (Hind i4) (refl …)) //]
  186   ]
  187 qed.
 188
  189 definition DeqItem ≝ λS. (* the DeqSet built from pitem S, beqitem S and the proof beqitem_true S *)
  190   mk_DeqSet (pitem S) (beqitem S) (beqitem_true S).
 191
 192 (* We also add a couple of unification hints to allow the type inference system
 193 to look at (pitem S) as the carrier of a DeqSet, and at beqitem as if it was the
 194 equality function of a DeqSet. *)
 195
  196 unification hint  0 ≔ S; (* hint: pitem S may be unified with the carrier of the DeqSet X below *)
  197     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
  198 (* ---------------------------------------- *) ⊢
  199     pitem S ≡ carr X.
 200
  201 unification hint  0 ≔ S,i1,i2; (* hint: beqitem S i1 i2 may be unified with the eqb of the DeqSet X below *)
  202     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
  203 (* ---------------------------------------- *) ⊢
  204     beqitem S i1 i2 ≡ eqb X i1 i2.
 205
 206 (*
 207 Semantics of pointed regular expressions
 208
  209 The intuitive semantics of a point is to mark the position where
 210 we should start reading the regular expression. The language associated
 211 to a pre is the union of the languages associated with its points. *)
 212
  213 let rec in_pl (S : DeqSet) (r : pitem S) on r : word S → Prop ≝ (* language of an item as a predicate on words, by structural recursion on r *)
  214 match r with
  215 [ pz ⇒ ∅
  216 | pe ⇒ ∅
  217 | ps _ ⇒ ∅ (* the three point-free base items contribute no word *)
  218 | pp x ⇒ { (x::[]) } (* a pointed symbol contributes the one-symbol word x *)
  219 | pc r1 r2 ⇒ (in_pl ? r1) · \sem{forget ? r2} ∪ (in_pl ? r2) (* words of r1 followed by words of the carrier of r2, or words of r2 *)
  220 | po r1 r2 ⇒ (in_pl ? r1) ∪ (in_pl ? r2)
  221 | pk r1 ⇒ (in_pl ? r1) · \sem{forget ? r1}^*  ]. (* words of r1 followed by words of the star of its carrier *)
  222
  223 interpretation "in_pl" 'in_l E = (in_pl ? E). (* on an item, 'in_l E means in_pl E *)
  224 interpretation "in_pl mem" 'mem w l = (in_pl ? l w). (* on an item l, 'mem w l means in_pl l w *)
 225
  226 definition in_prl ≝ λS : DeqSet.λp:pre S. (* language of a pre: that of its item, with ϵ added when the boolean is true *)
  227   if (\snd p) then \sem{\fst p} ∪ {ϵ} else \sem{\fst p}.
  228
  229 interpretation "in_prl mem" 'mem w l = (in_prl ? l w). (* on a pre l, 'mem w l means in_prl l w *)
  230 interpretation "in_prl" 'in_l E = (in_prl ? E). (* on a pre, 'in_l E means in_prl E *)
 231
 232 (* The following, trivial lemmas are only meant for rewriting purposes. *)
 233
  234 lemma sem_pre_true : ∀S.∀i:pitem S. (* in_prl on a pair whose boolean is true *)
  235   \sem{〈i,true〉} = \sem{i} ∪ {ϵ}.
  236 // qed. (* closed by automation alone *)
 237
  238 lemma sem_pre_false : ∀S.∀i:pitem S. (* in_prl on a pair whose boolean is false *)
  239   \sem{〈i,false〉} = \sem{i}.
  240 // qed. (* closed by automation alone *)
 241
  242 lemma sem_cat: ∀S.∀i1,i2:pitem S. (* rewriting form of the pc branch of in_pl *)
  243   \sem{i1 · i2} = \sem{i1} · \sem{|i2|} ∪ \sem{i2}.
  244 // qed. (* closed by automation alone *)
 245
  246 lemma sem_cat_w: ∀S.∀i1,i2:pitem S.∀w. (* same equation as sem_cat, applied to a word w and stated with a disjunction *)
  247   \sem{i1 · i2} w = ((\sem{i1} · \sem{|i2|}) w ∨ \sem{i2} w).
  248 // qed. (* closed by automation alone *)
 249
  250 lemma sem_plus: ∀S.∀i1,i2:pitem S. (* rewriting form of the po branch of in_pl *)
  251   \sem{i1 + i2} = \sem{i1} ∪ \sem{i2}.
  252 // qed. (* closed by automation alone *)
 253
  254 lemma sem_plus_w: ∀S.∀i1,i2:pitem S.∀w. (* same equation as sem_plus, applied to a word w and stated with a disjunction *)
  255   \sem{i1 + i2} w = (\sem{i1} w ∨ \sem{i2} w).
  256 // qed. (* closed by automation alone *)
 257
  258 lemma sem_star : ∀S.∀i:pitem S. (* rewriting form of the pk branch of in_pl *)
  259   \sem{i^*} = \sem{i} · \sem{|i|}^*.
  260 // qed. (* closed by automation alone *)
 261
  262 lemma sem_star_w : ∀S.∀i:pitem S.∀w. (* same equation as sem_star, applied to a word w with the split w1 @ w2 made explicit *)
  263   \sem{i^*} w = (∃w1,w2.w1 @ w2 = w ∧ \sem{i} w1 ∧ \sem{|i|}^* w2).
  264 // qed. (* closed by automation alone *)
 265
  266 (* Below are a few, simple, semantic properties of items. In particular:
  267 - not_epsilon_lp : ∀S:DeqSet.∀e:pitem S. ¬ (ϵ ∈ e).
  268 - epsilon_to_true, true_to_epsilon : ∀S.∀e:pre S. the two directions of (ϵ ∈ e) ↔ (\snd e = true).
  269 - minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}.
  270 - minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}.
  271 The first property is proved by a simple induction on the item; the other
  272 results are easy corollaries. We need an auxiliary lemma first. *)
 273
  274 lemma append_eq_nil : ∀S.∀w1,w2:word S. w1 @ w2 = ϵ → w1 = ϵ. (* if an append equals the empty word, its left operand is the empty word *)
  275 #S #w1 #w2 cases w1 // #a #tl normalize #H destruct qed. (* case split on w1; in the cons case the hypothesis is refuted by destruct *)
 276
  277 lemma not_epsilon_lp : ∀S:DeqSet.∀e:pitem S. ¬ (ϵ ∈ e). (* the empty word never belongs to the language of an item *)
  278 #S #e elim e normalize /2/ (* induction on e; automation leaves the three goals handled below *)
  279   [#r1 #r2 * #n1 #n2 % * /2/ * #w1 * #w2 * * #H
  280    >(append_eq_nil …H…) /2/ (* a split of ϵ has an empty first part, contradicting the hypothesis on r1 *)
  281   |#r1 #r2 #n1 #n2 % * /2/ (* either disjunct contradicts one of the two induction hypotheses *)
  282   |#r #n % * #w1 * #w2 * * #H >(append_eq_nil …H…) /2/ (* same argument as the first goal, with a single subitem *)
  283   ]
  284 qed.
 285
  286 lemma epsilon_to_true : ∀S.∀e:pre S. ϵ ∈ e → \snd e = true. (* a pre whose language contains ϵ has its boolean set to true *)
  287 #S * #i #b cases b // normalize #H @False_ind /2/ (* false case: H puts ϵ in the item, refuted by automation - presumably via not_epsilon_lp *)
  288 qed.
 289
  290 lemma true_to_epsilon : ∀S.∀e:pre S. \snd e = true → ϵ ∈ e. (* a pre whose boolean is true has ϵ in its language *)
  291 #S * #i #b #btrue normalize in btrue; >btrue %2 // (* rewrite the boolean to true, then pick the second disjunct of the union *)
  292 qed.
 293
  294 lemma minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}. (* removing the empty word from the language of an item changes nothing *)
  295 #S #i #w % (* both implications, for an arbitrary word w *)
  296   [#H whd % // normalize @(not_to_not … (not_epsilon_lp …i)) // (* w differs from ϵ, by not_epsilon_lp *)
  297   |* // (* the converse keeps only the membership component *)
  298   ]
  299 qed.
 300
  301 lemma minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}. (* the language of the item is the language of the pre with the empty word removed *)
  302 #S * #i * (* destructure the pair, then split on its boolean *)
  303   [>sem_pre_true normalize in ⊢ (??%?); #w % (* boolean true: the right-hand side is rewritten with sem_pre_true *)
  304     [/3/ | * * // #H1 #H2 @False_ind @(absurd …H1 H2)] (* the {ϵ} disjunct clashes with the hypothesis coming from the set difference *)
  305   |>sem_pre_false normalize in ⊢ (??%?); #w % [ /3/ | * // ] (* boolean false: the right-hand side is rewritten with sem_pre_false *)
  306   ]
  307 qed.