matita/matita/lib/tutorial/chapter8.ma

   1 (*
   2 Regular Expressions
   3
   4 We shall apply all the previous machinery to the study of regular languages
   5 and the constructions of the associated finite automata. *)
   6
   7 include "tutorial/chapter7.ma".
   8
   9 (* The type re of regular expressions over an alphabet $S$ is the smallest
  10 collection of objects generated by the following constructors: *)
  11
  12 inductive re (S: DeqSet) : Type[0] ≝
  13    z: re S                (* empty: ∅ *)
  14  | e: re S                (* epsilon: ϵ *)
  15  | s: S → re S            (* symbol: a *)
  16  | c: re S → re S → re S  (* concatenation: e1 · e2 *)
  17  | o: re S → re S → re S  (* plus: e1 + e2 *)
  18  | k: re S → re S.        (* kleene's star: e* *)
  19
  20 interpretation "re epsilon" 'epsilon = (e ?).
  21 interpretation "re or" 'plus a b = (o ? a b).
  22 interpretation "re cat" 'middot a b = (c ? a b).
  23 interpretation "re star" 'star a = (k ? a).
  24
  25 notation < "a" non associative with precedence 90 for @{ 'ps $a}.
  26 notation > "` term 90 a" non associative with precedence 90 for @{ 'ps $a}.
  27 interpretation "atom" 'ps a = (s ? a).
  28
  29 notation "`∅" non associative with precedence 90 for @{ 'empty }.
  30 interpretation "empty" 'empty = (z ?).
  31
  32 (* The language sem{e} associated with the regular expression e is inductively
  33 defined by the following function: *)
  34
  35 let rec in_l (S : DeqSet) (r : re S) on r : word S → Prop ≝
  36 match r with
  37 [ z ⇒ ∅
  38 | e ⇒ {ϵ}
  39 | s x ⇒ { (x::[]) }
  40 | c r1 r2 ⇒ (in_l ? r1) · (in_l ? r2)
  41 | o r1 r2 ⇒ (in_l ? r1) ∪ (in_l ? r2)
  42 | k r1 ⇒ (in_l ? r1) ^*].
  43
  44
  45 notation "\sem{term 19 E}" non associative with precedence 75 for @{'in_l $E}.
  46 interpretation "in_l" 'in_l E = (in_l ? E).
  47 interpretation "in_l mem" 'mem w l = (in_l ? l w).
  48
  49 lemma rsem_star : ∀S.∀r: re S. \sem{r^*} = \sem{r}^*.
  50 // qed.
  51
  52
  53 (*
  54 Pointed Regular expressions
  55
  56 We now introduce pointed regular expressions, that are the main tool we shall
  57 use for the construction of the automaton.
  58 A pointed regular expression is just a regular expression internally labelled
  59 with some additional points. Intuitively, points mark the positions inside the
  60 regular expression which have been reached after reading some prefix of
  61 the input string, or better the positions where the processing of the remaining
  62 string has to be started. Each pointed expression for $e$ represents a state of
  63 the {\em deterministic} automaton associated with $e$; since we obviously have
  64 only a finite number of possible labellings, the number of states of the automaton
  65 is finite.
  66
  67 Pointed regular expressions provide the tool for an algebraic revisitation of
  68 McNaughton and Yamada's algorithm for position automata, making the proof of its
  69 correctness, that is far from trivial, particularly clear and simple. In particular,
  70 pointed expressions offer an appealing alternative to Brzozowski's derivatives,
  71 avoiding their weakest point, namely the fact of being forced to quotient derivatives
  72 w.r.t. a suitable notion of equivalence in order to get a finite number of states
  73 (that is not essential for recognizing strings, but is crucial for comparing regular
  74 expressions).
  75
  76 Our main data structure is the notion of pointed item, that is meant whose purpose
  77 is to encode a set of positions inside a regular expression.
  78 The idea of formalizing pointers inside a data type by means of a labelled version
  79 of the data type itself is probably one of the first, major lessons learned in the
  80 formalization of the metatheory of programming languages. For our purposes, it is
  81 enough to mark positions preceding individual characters, so we shall have two kinds
  82 of characters •a (pp a) and a (ps a) according to the case a is pointed or not. *)
  83
  84 inductive pitem (S: DeqSet) : Type[0] ≝
  85    pz: pitem S                       (* empty *)
  86  | pe: pitem S                       (* epsilon *)
  87  | ps: S → pitem S                   (* symbol *)
  88  | pp: S → pitem S                   (* pointed sysmbol *)
  89  | pc: pitem S → pitem S → pitem S   (* concatenation *)
  90  | po: pitem S → pitem S → pitem S   (* plus *)
  91  | pk: pitem S → pitem S.            (* kleene's star *)
  92
  93 (* A pointed regular expression (pre) is just a pointed item with an additional
  94 boolean, that must be understood as the possibility to have a trailing point at
  95 the end of the expression. As we shall see, pointed regular expressions can be
  96 understood as states of a DFA, and the boolean indicates if
  97 the state is final or not. *)
  98
  99 definition pre ≝ λS.pitem S × bool.
 100
 101 interpretation "pitem star" 'star a = (pk ? a).
 102 interpretation "pitem or" 'plus a b = (po ? a b).
 103 interpretation "pitem cat" 'middot a b = (pc ? a b).
 104 notation < ".a" non associative with precedence 90 for @{ 'pp $a}.
 105 notation > "`. term 90 a" non associative with precedence 90 for @{ 'pp $a}.
 106 interpretation "pitem pp" 'pp a = (pp ? a).
 107 interpretation "pitem ps" 'ps a = (ps ? a).
 108 interpretation "pitem epsilon" 'epsilon = (pe ?).
 109 interpretation "pitem empty" 'empty = (pz ?).
 110
 111 (* The carrier $|i|$ of an item i is the regular expression obtained from i by
 112 removing all the points. Similarly, the carrier of a pointed regular expression
 113 is the carrier of its item. *)
 114
 115 let rec forget (S: DeqSet) (l : pitem S) on l: re S ≝
 116  match l with
 117   [ pz ⇒ z ? (* `∅ *)
 118   | pe ⇒ ϵ
 119   | ps x ⇒ `x
 120   | pp x ⇒ `x
 121   | pc E1 E2 ⇒ (forget ? E1) · (forget ? E2)
 122   | po E1 E2 ⇒ (forget ? E1) + (forget ? E2)
 123   | pk E ⇒ (forget ? E)^* ].
 124
 125 (* Already in the library
 126 notation "| term 19 C |" with precedence 70 for @{ 'card $C }.
 127 *)
 128 interpretation "forget" 'card a = (forget ? a).
 129
 130 lemma erase_dot : ∀S.∀e1,e2:pitem S. |e1 · e2| = c ? (|e1|) (|e2|).
 131 // qed.
 132
 133 lemma erase_plus : ∀S.∀i1,i2:pitem S.
 134   |i1 + i2| = |i1| + |i2|.
 135 // qed.
 136
 137 lemma erase_star : ∀S.∀i:pitem S.|i^*| = |i|^*.
 138 // qed.
 139
 140 (*
 141 Comparing items and pres
 142
 143 Items and pres are very concrete datatypes: they can be effectively compared,
 144 and enumerated. In particular, we can define a boolean equality beqitem and a proof
 145 beqitem_true that it refects propositional equality, enriching the set (pitem S)
 146 to a DeqSet. *)
 147
 148 let rec beqitem S (i1,i2: pitem S) on i1 ≝
 149   match i1 with
 150   [ pz ⇒ match i2 with [ pz ⇒ true | _ ⇒ false]
 151   | pe ⇒ match i2 with [ pe ⇒ true | _ ⇒ false]
 152   | ps y1 ⇒ match i2 with [ ps y2 ⇒ y1==y2 | _ ⇒ false]
 153   | pp y1 ⇒ match i2 with [ pp y2 ⇒ y1==y2 | _ ⇒ false]
 154   | po i11 i12 ⇒ match i2 with
 155     [ po i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
 156     | _ ⇒ false]
 157   | pc i11 i12 ⇒ match i2 with
 158     [ pc i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
 159     | _ ⇒ false]
 160   | pk i11 ⇒ match i2 with [ pk i21 ⇒ beqitem S i11 i21 | _ ⇒ false]
 161   ].
 162
 163 lemma beqitem_true: ∀S,i1,i2. iff (beqitem S i1 i2 = true) (i1 = i2).
 164 #S #i1 elim i1
 165   [#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
 166   |#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
 167   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
 168     [>(\P H) // | @(\b (refl …))]
 169   |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
 170     [>(\P H) // | @(\b (refl …))]
 171   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
 172    normalize #H destruct
 173     [cases (true_or_false (beqitem S i11 i21)) #H1
 174       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
 175       |>H1 in H; normalize #abs @False_ind /2/
 176       ]
 177     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
 178     ]
 179   |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
 180    normalize #H destruct
 181     [cases (true_or_false (beqitem S i11 i21)) #H1
 182       [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
 183       |>H1 in H; normalize #abs @False_ind /2/
 184       ]
 185     |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
 186     ]
 187   |#i3 #Hind #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i4] %
 188    normalize #H destruct
 189     [>(proj1 … (Hind i4) H) // |>(proj2 … (Hind i4) (refl …)) //]
 190   ]
 191 qed.
 192
 193 definition DeqItem ≝ λS.
 194   mk_DeqSet (pitem S) (beqitem S) (beqitem_true S).
 195
 196 (* We also add a couple of unification hints to allow the type inference system
 197 to look at (pitem S) as the carrier of a DeqSet, and at beqitem as if it was the
 198 equality function of a DeqSet. *)
 199
 200 unification hint  0 ≔ S;
 201     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
 202 (* ---------------------------------------- *) ⊢
 203     pitem S ≡ carr X.
 204
 205 unification hint  0 ≔ S,i1,i2;
 206     X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
 207 (* ---------------------------------------- *) ⊢
 208     beqitem S i1 i2 ≡ eqb X i1 i2.
 209
 210 (*
 211 Semantics of pointed regular expressions
 212
 213 The intuitive semantic of a point is to mark the position where
 214 we should start reading the regular expression. The language associated
 215 to a pre is the union of the languages associated with its points. *)
 216
 217 let rec in_pl (S : DeqSet) (r : pitem S) on r : word S → Prop ≝
 218 match r with
 219 [ pz ⇒ ∅
 220 | pe ⇒ ∅
 221 | ps _ ⇒ ∅
 222 | pp x ⇒ { (x::[]) }
 223 | pc r1 r2 ⇒ (in_pl ? r1) · \sem{forget ? r2} ∪ (in_pl ? r2)
 224 | po r1 r2 ⇒ (in_pl ? r1) ∪ (in_pl ? r2)
 225 | pk r1 ⇒ (in_pl ? r1) · \sem{forget ? r1}^*  ].
 226
 227 interpretation "in_pl" 'in_l E = (in_pl ? E).
 228 interpretation "in_pl mem" 'mem w l = (in_pl ? l w).
 229
 230 definition in_prl ≝ λS : DeqSet.λp:pre S.
 231   if (snd ?? p) then \sem{fst ?? p} ∪ {ϵ} else \sem{fst ?? p}.
 232
 233 interpretation "in_prl mem" 'mem w l = (in_prl ? l w).
 234 interpretation "in_prl" 'in_l E = (in_prl ? E).
 235
 236 (* The following, trivial lemmas are only meant for rewriting purposes. *)
 237
 238 lemma sem_pre_true : ∀S.∀i:pitem S.
 239   \sem{〈i,true〉} = \sem{i} ∪ {ϵ}.
 240 // qed.
 241
 242 lemma sem_pre_false : ∀S.∀i:pitem S.
 243   \sem{〈i,false〉} = \sem{i}.
 244 // qed.
 245
 246 lemma sem_cat: ∀S.∀i1,i2:pitem S.
 247   \sem{i1 · i2} = \sem{i1} · \sem{|i2|} ∪ \sem{i2}.
 248 // qed.
 249
 250 lemma sem_cat_w: ∀S.∀i1,i2:pitem S.∀w.
 251   \sem{i1 · i2} w = ((\sem{i1} · \sem{|i2|}) w ∨ \sem{i2} w).
 252 // qed.
 253
 254 lemma sem_plus: ∀S.∀i1,i2:pitem S.
 255   \sem{i1 + i2} = \sem{i1} ∪ \sem{i2}.
 256 // qed.
 257
 258 lemma sem_plus_w: ∀S.∀i1,i2:pitem S.∀w.
 259   \sem{i1 + i2} w = (\sem{i1} w ∨ \sem{i2} w).
 260 // qed.
 261
 262 lemma sem_star : ∀S.∀i:pitem S.
 263   \sem{i^*} = \sem{i} · \sem{|i|}^*.
 264 // qed.
 265
 266 lemma sem_star_w : ∀S.∀i:pitem S.∀w.
 267   \sem{i^*} w = (∃w1,w2.w1 @ w2 = w ∧ \sem{i} w1 ∧ \sem{|i|}^* w2).
 268 // qed.
 269
 270 (* Below are a few, simple, semantic properties of items. In particular:
 271 - not_epsilon_item : ∀S:DeqSet.∀i:pitem S. ¬ (\sem{i} ϵ).
 272 - epsilon_pre : ∀S.∀e:pre S. (\sem{i} ϵ) ↔ (\snd e = true).
 273 - minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}.
 274 - minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}.
 275 The first property is proved by a simple induction on $i$; the other
 276 results are easy corollaries. We need an auxiliary lemma first. *)
 277
 278 lemma append_eq_nil : ∀S.∀w1,w2:word S. w1 @ w2 = ϵ → w1 = ϵ.
 279 #S #w1 #w2 cases w1 // #a #tl normalize #H destruct qed.
 280
 281 lemma not_epsilon_lp : ∀S:DeqSet.∀e:pitem S. ¬ (ϵ ∈ e).
 282 #S #e elim e normalize
 283   [1,2,3:/2/
 284   |#x % #H destruct
 285   |#r1 #r2 * #n1 #n2 % * /2/ * #w1 * #w2 * * #H
 286    >(append_eq_nil …H…) /2/
 287   |#r1 #r2 #n1 #n2 % * /2/
 288   |#r #n % * #w1 * #w2 * * #H >(append_eq_nil …H…) /2/
 289   ]
 290 qed.
 291
 292 lemma epsilon_to_true : ∀S.∀e:pre S. ϵ ∈ e → snd … e = true.
 293 #S * #i #b cases b // normalize #H @False_ind /2/
 294 qed.
 295
 296 lemma true_to_epsilon : ∀S.∀e:pre S. snd … e = true → ϵ ∈ e.
 297 #S * #i #b #btrue normalize in btrue; >btrue %2 //
 298 qed.
 299
 300 lemma minus_eps_item: ∀S.∀i:pitem S. \sem{i} ≐ \sem{i}-{ϵ}.
 301 #S #i #w %
 302   [#H whd % // normalize @(not_to_not … (not_epsilon_lp …i)) //
 303   |* //
 304   ]
 305 qed.
 306
 307 lemma minus_eps_pre: ∀S.∀e:pre S. \sem{fst ?? e} ≐ \sem{e}-{ϵ}.
 308 #S * #i *
 309   [>sem_pre_true normalize in ⊢ (??%?); #w %
 310     [/3/ | * * // #H1 #H2 @False_ind @(absurd …H1 H2)]
 311   |>sem_pre_false normalize in ⊢ (??%?); #w % [ /3/ | * // ]
 312   ]
 313 qed.