-(*
-Regular Expressions
-
-We shall apply all the previous machinery to the study of regular languages
-and the constructions of the associated finite automata. *)
+(*
+Formal Languages
+In this chapter we shall apply our notion of DeqSet, and the list operations
+defined in Chapter 4 to formal languages. A formal language is an arbitrary set
+of words over a given alphabet, that we shall represent as a predicate over words.
+*)
include "tutorial/chapter6.ma".
-(* The type re of regular expressions over an alphabet $S$ is the smallest
-collection of objects generated by the following constructors: *)
+(* A word (or string) over an alphabet S is just a list of elements of S.*)
+definition word ≝ λS:DeqSet.list S.
-inductive re (S: DeqSet) : Type[0] ≝
- z: re S (* empty: ∅ *)
- | e: re S (* epsilon: ϵ *)
- | s: S → re S (* symbol: a *)
- | c: re S → re S → re S (* concatenation: e1 · e2 *)
- | o: re S → re S → re S (* plus: e1 + e2 *)
- | k: re S → re S. (* kleene's star: e* *)
+(* For any alphabet there is only one word of length 0, the empty word, which is
+denoted by ϵ .*)
-interpretation "re epsilon" 'epsilon = (e ?).
-interpretation "re or" 'plus a b = (o ? a b).
-interpretation "re cat" 'middot a b = (c ? a b).
-interpretation "re star" 'star a = (k ? a).
+notation "ϵ" non associative with precedence 90 for @{ 'epsilon }.
+interpretation "epsilon" 'epsilon = (nil ?).
-notation < "a" non associative with precedence 90 for @{ 'ps $a}.
-notation > "` term 90 a" non associative with precedence 90 for @{ 'ps $a}.
-interpretation "atom" 'ps a = (s ? a).
+(* The operation that consists in appending two words to form a new word, whose
+length is the sum of the lengths of the original words is called concatenation.
+String concatenation is just the append operation over lists, hence there is no
+point to define it. Similarly, many of its properties, such as the fact that
+concatenating a word with the empty word gives the original word, follow by
+general results over lists.
+*)
-notation "`∅" non associative with precedence 90 for @{ 'empty }.
-interpretation "empty" 'empty = (z ?).
+(*
+Operations over languages
-(* The language sem{e} associated with the regular expression e is inductively
-defined by the following function: *)
+Languages inherit all the basic operations for sets, namely union, intersection,
+complementation, substraction, and so on. In addition, we may define some new
+operations induced by string concatenation, and in particular the concatenation
+A · B of two languages A and B, the so called Kleene's star A* of A and the
+derivative of a language A w.r.t. a given character a. *)
-let rec in_l (S : DeqSet) (r : re S) on r : word S → Prop ≝
-match r with
-[ z ⇒ ∅
-| e ⇒ {ϵ}
-| s x ⇒ { (x::[]) }
-| c r1 r2 ⇒ (in_l ? r1) · (in_l ? r2)
-| o r1 r2 ⇒ (in_l ? r1) ∪ (in_l ? r2)
-| k r1 ⇒ (in_l ? r1) ^*].
+definition cat : ∀S,l1,l2,w.Prop ≝
+ λS.λl1,l2.λw:word S.∃w1,w2.w1 @ w2 = w ∧ l1 w1 ∧ l2 w2.
+(*
+notation "a · b" non associative with precedence 60 for @{ 'middot $a $b}.
+*)
+interpretation "cat lang" 'middot a b = (cat ? a b).
-notation "\sem{term 19 E}" non associative with precedence 75 for @{'in_l $E}.
-interpretation "in_l" 'in_l E = (in_l ? E).
-interpretation "in_l mem" 'mem w l = (in_l ? l w).
+(* Given a language l, the Kleene's star of l, denoted by l*, is the set of
+finite-length strings that can be generated by concatenating arbitrary strings of
+l. In other words, w belongs to l* is and only if there exists a list of strings
+w1,w2,...wk all belonging to l, such that l = w1w2...wk.
+We need to define the latter operations. The following flatten function takes in
+input a list of words and concatenates them together. *)
-lemma rsem_star : ∀S.∀r: re S. \sem{r^*} = \sem{r}^*.
-// qed.
+(* Already in the library
+let rec flatten (S : DeqSet) (l : list (word S)) on l : word S ≝
+match l with [ nil ⇒ [ ] | cons w tl ⇒ w @ flatten ? tl ].
+*)
+(* Given a list of words l and a language r, (conjunct l r) is true if and only
+if all words in l are in r, that is for every w in l, r w holds. *)
-(*
-Pointed Regular expressions
-
-We now introduce pointed regular expressions, that are the main tool we shall
-use for the construction of the automaton.
-A pointed regular expression is just a regular expression internally labelled
-with some additional points. Intuitively, points mark the positions inside the
-regular expression which have been reached after reading some prefix of
-the input string, or better the positions where the processing of the remaining
-string has to be started. Each pointed expression for $e$ represents a state of
-the {\em deterministic} automaton associated with $e$; since we obviously have
-only a finite number of possible labellings, the number of states of the automaton
-is finite.
-
-Pointed regular expressions provide the tool for an algebraic revisitation of
-McNaughton and Yamada's algorithm for position automata, making the proof of its
-correctness, that is far from trivial, particularly clear and simple. In particular,
-pointed expressions offer an appealing alternative to Brzozowski's derivatives,
-avoiding their weakest point, namely the fact of being forced to quotient derivatives
-w.r.t. a suitable notion of equivalence in order to get a finite number of states
-(that is not essential for recognizing strings, but is crucial for comparing regular
-expressions).
-
-Our main data structure is the notion of pointed item, that is meant whose purpose
-is to encode a set of positions inside a regular expression.
-The idea of formalizing pointers inside a data type by means of a labelled version
-of the data type itself is probably one of the first, major lessons learned in the
-formalization of the metatheory of programming languages. For our purposes, it is
-enough to mark positions preceding individual characters, so we shall have two kinds
-of characters •a (pp a) and a (ps a) according to the case a is pointed or not. *)
-
-inductive pitem (S: DeqSet) : Type[0] ≝
- pz: pitem S (* empty *)
- | pe: pitem S (* epsilon *)
- | ps: S → pitem S (* symbol *)
- | pp: S → pitem S (* pointed sysmbol *)
- | pc: pitem S → pitem S → pitem S (* concatenation *)
- | po: pitem S → pitem S → pitem S (* plus *)
- | pk: pitem S → pitem S. (* kleene's star *)
-
-(* A pointed regular expression (pre) is just a pointed item with an additional
-boolean, that must be understood as the possibility to have a trailing point at
-the end of the expression. As we shall see, pointed regular expressions can be
-understood as states of a DFA, and the boolean indicates if
-the state is final or not. *)
-
-definition pre ≝ λS.pitem S × bool.
-
-interpretation "pitem star" 'star a = (pk ? a).
-interpretation "pitem or" 'plus a b = (po ? a b).
-interpretation "pitem cat" 'middot a b = (pc ? a b).
-notation < ".a" non associative with precedence 90 for @{ 'pp $a}.
-notation > "`. term 90 a" non associative with precedence 90 for @{ 'pp $a}.
-interpretation "pitem pp" 'pp a = (pp ? a).
-interpretation "pitem ps" 'ps a = (ps ? a).
-interpretation "pitem epsilon" 'epsilon = (pe ?).
-interpretation "pitem empty" 'empty = (pz ?).
-
-(* The carrier $|i|$ of an item i is the regular expression obtained from i by
-removing all the points. Similarly, the carrier of a pointed regular expression
-is the carrier of its item. *)
-
-let rec forget (S: DeqSet) (l : pitem S) on l: re S ≝
- match l with
- [ pz ⇒ z ? (* `∅ *)
- | pe ⇒ ϵ
- | ps x ⇒ `x
- | pp x ⇒ `x
- | pc E1 E2 ⇒ (forget ? E1) · (forget ? E2)
- | po E1 E2 ⇒ (forget ? E1) + (forget ? E2)
- | pk E ⇒ (forget ? E)^* ].
-
-interpretation "forget" 'card a = (forget ? a).
-
-lemma erase_dot : ∀S.∀e1,e2:pitem S. |e1 · e2| = c ? (|e1|) (|e2|).
-// qed.
-
-lemma erase_plus : ∀S.∀i1,i2:pitem S.
- |i1 + i2| = |i1| + |i2|.
-// qed.
-
-lemma erase_star : ∀S.∀i:pitem S.|i^*| = |i|^*.
-// qed.
+let rec conjunct (S : DeqSet) (l : list (word S)) (r : word S → Prop) on l: Prop ≝
+match l with [ nil ⇒ True | cons w tl ⇒ r w ∧ conjunct ? tl r ].
-(*
-Comparing items and pres
-
-Items and pres are very concrete datatypes: they can be effectively compared,
-and enumerated. In particular, we can define a boolean equality beqitem and a proof
-beqitem_true that it refects propositional equality, enriching the set (pitem S)
-to a DeqSet. *)
-
-let rec beqitem S (i1,i2: pitem S) on i1 ≝
- match i1 with
- [ pz ⇒ match i2 with [ pz ⇒ true | _ ⇒ false]
- | pe ⇒ match i2 with [ pe ⇒ true | _ ⇒ false]
- | ps y1 ⇒ match i2 with [ ps y2 ⇒ y1==y2 | _ ⇒ false]
- | pp y1 ⇒ match i2 with [ pp y2 ⇒ y1==y2 | _ ⇒ false]
- | po i11 i12 ⇒ match i2 with
- [ po i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
- | _ ⇒ false]
- | pc i11 i12 ⇒ match i2 with
- [ pc i21 i22 ⇒ beqitem S i11 i21 ∧ beqitem S i12 i22
- | _ ⇒ false]
- | pk i11 ⇒ match i2 with [ pk i21 ⇒ beqitem S i11 i21 | _ ⇒ false]
- ].
-
-lemma beqitem_true: ∀S,i1,i2. iff (beqitem S i1 i2 = true) (i1 = i2).
-#S #i1 elim i1
- [#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
- |#i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % // normalize #H destruct
- |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
- [>(\P H) // | @(\b (refl …))]
- |#x #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] % normalize #H destruct
- [>(\P H) // | @(\b (refl …))]
- |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
- normalize #H destruct
- [cases (true_or_false (beqitem S i11 i21)) #H1
- [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
- |>H1 in H; normalize #abs @False_ind /2/
- ]
- |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
- ]
- |#i11 #i12 #Hind1 #Hind2 #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i3] %
- normalize #H destruct
- [cases (true_or_false (beqitem S i11 i21)) #H1
- [>(proj1 … (Hind1 i21) H1) >(proj1 … (Hind2 i22)) // >H1 in H; #H @H
- |>H1 in H; normalize #abs @False_ind /2/
- ]
- |>(proj2 … (Hind1 i21) (refl …)) >(proj2 … (Hind2 i22) (refl …)) //
- ]
- |#i3 #Hind #i2 cases i2 [||#a|#a|#i21 #i22| #i21 #i22|#i4] %
- normalize #H destruct
- [>(proj1 … (Hind i4) H) // |>(proj2 … (Hind i4) (refl …)) //]
- ]
-qed.
+(* We are ready to give the formal definition of the Kleene's star of l:
+a word w belongs to l* is and only if there exists a list of strings
+lw such that (conjunct lw l) and l = flatten lw. *)
+
+definition flatten ≝ λA.foldr (list A) (list A) (append A) [].
+
+definition star ≝ λS.λl.λw:word S.∃lw.flatten ? lw = w ∧ conjunct ? lw l.
-definition DeqItem ≝ λS.
- mk_DeqSet (pitem S) (beqitem S) (beqitem_true S).
+notation "a ^ *" non associative with precedence 90 for @{ 'star $a}.
+interpretation "star lang" 'star l = (star ? l).
-(* We also add a couple of unification hints to allow the type inference system
-to look at (pitem S) as the carrier of a DeqSet, and at beqitem as if it was the
-equality function of a DeqSet. *)
+(* The derivative of a language A with respect to a character a is the set of
+all strings w such that aw is in A. *)
-unification hint 0 ≔ S;
- X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
-(* ---------------------------------------- *) ⊢
- pitem S ≡ carr X.
-
-unification hint 0 ≔ S,i1,i2;
- X ≟ mk_DeqSet (pitem S) (beqitem S) (beqitem_true S)
-(* ---------------------------------------- *) ⊢
- beqitem S i1 i2 ≡ eqb X i1 i2.
+definition deriv ≝ λS.λA:word S → Prop.λa,w. A (a::w).
(*
-Semantics of pointed regular expressions
-
-The intuitive semantic of a point is to mark the position where
-we should start reading the regular expression. The language associated
-to a pre is the union of the languages associated with its points. *)
-
-let rec in_pl (S : DeqSet) (r : pitem S) on r : word S → Prop ≝
-match r with
-[ pz ⇒ ∅
-| pe ⇒ ∅
-| ps _ ⇒ ∅
-| pp x ⇒ { (x::[]) }
-| pc r1 r2 ⇒ (in_pl ? r1) · \sem{forget ? r2} ∪ (in_pl ? r2)
-| po r1 r2 ⇒ (in_pl ? r1) ∪ (in_pl ? r2)
-| pk r1 ⇒ (in_pl ? r1) · \sem{forget ? r1}^* ].
-
-interpretation "in_pl" 'in_l E = (in_pl ? E).
-interpretation "in_pl mem" 'mem w l = (in_pl ? l w).
-
-definition in_prl ≝ λS : DeqSet.λp:pre S.
- if (\snd p) then \sem{\fst p} ∪ {ϵ} else \sem{\fst p}.
+Language equalities
+
+Equality between languages is just the usual extensional equality between
+sets. The operation of concatenation behaves well with respect to this equality. *)
+
+lemma cat_ext_l: ∀S.∀A,B,C:word S →Prop.
+ A ≐ C → A · B ≐ C · B.
+#S #A #B #C #H #w % * #w1 * #w2 * * #eqw #inw1 #inw2
+cases (H w1) /6/
+qed.
+
+lemma cat_ext_r: ∀S.∀A,B,C:word S →Prop.
+ B ≐ C → A · B ≐ A · C.
+#S #A #B #C #H #w % * #w1 * #w2 * * #eqw #inw1 #inw2
+cases (H w2) /6/
+qed.
-interpretation "in_prl mem" 'mem w l = (in_prl ? l w).
-interpretation "in_prl" 'in_l E = (in_prl ? E).
-
-(* The following, trivial lemmas are only meant for rewriting purposes. *)
-
-lemma sem_pre_true : ∀S.∀i:pitem S.
- \sem{〈i,true〉} = \sem{i} ∪ {ϵ}.
-// qed.
-
-lemma sem_pre_false : ∀S.∀i:pitem S.
- \sem{〈i,false〉} = \sem{i}.
-// qed.
-
-lemma sem_cat: ∀S.∀i1,i2:pitem S.
- \sem{i1 · i2} = \sem{i1} · \sem{|i2|} ∪ \sem{i2}.
-// qed.
-
-lemma sem_cat_w: ∀S.∀i1,i2:pitem S.∀w.
- \sem{i1 · i2} w = ((\sem{i1} · \sem{|i2|}) w ∨ \sem{i2} w).
-// qed.
-
-lemma sem_plus: ∀S.∀i1,i2:pitem S.
- \sem{i1 + i2} = \sem{i1} ∪ \sem{i2}.
-// qed.
-
-lemma sem_plus_w: ∀S.∀i1,i2:pitem S.∀w.
- \sem{i1 + i2} w = (\sem{i1} w ∨ \sem{i2} w).
-// qed.
-
-lemma sem_star : ∀S.∀i:pitem S.
- \sem{i^*} = \sem{i} · \sem{|i|}^*.
-// qed.
-
-lemma sem_star_w : ∀S.∀i:pitem S.∀w.
- \sem{i^*} w = (∃w1,w2.w1 @ w2 = w ∧ \sem{i} w1 ∧ \sem{|i|}^* w2).
-// qed.
-
-(* Below are a few, simple, semantic properties of items. In particular:
-- not_epsilon_item : ∀S:DeqSet.∀i:pitem S. ¬ (\sem{i} ϵ).
-- epsilon_pre : ∀S.∀e:pre S. (\sem{i} ϵ) ↔ (\snd e = true).
-- minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}.
-- minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}.
-The first property is proved by a simple induction on $i$; the other
-results are easy corollaries. We need an auxiliary lemma first. *)
-
-lemma append_eq_nil : ∀S.∀w1,w2:word S. w1 @ w2 = ϵ → w1 = ϵ.
-#S #w1 #w2 cases w1 // #a #tl normalize #H destruct qed.
-
-lemma not_epsilon_lp : ∀S:DeqSet.∀e:pitem S. ¬ (ϵ ∈ e).
-#S #e elim e normalize /2/
- [#r1 #r2 * #n1 #n2 % * /2/ * #w1 * #w2 * * #H
- >(append_eq_nil …H…) /2/
- |#r1 #r2 #n1 #n2 % * /2/
- |#r #n % * #w1 * #w2 * * #H >(append_eq_nil …H…) /2/
+(* Concatenating a language with the empty language results in the
+empty language. *)
+lemma cat_empty_l: ∀S.∀A:word S→Prop. ∅ · A ≐ ∅.
+#S #A #w % [|*] * #w1 * #w2 * * #_ *
+qed.
+
+(* Concatenating a language l with the singleton language containing the
+empty string, results in the language l; that is {ϵ} is a left and right
+unit with respect to concatenation. *)
+
+lemma epsilon_cat_r: ∀S.∀A:word S →Prop.
+ A · {ϵ} ≐ A.
+#S #A #w %
+ [* #w1 * #w2 * * #eqw #inw1 normalize #eqw2 <eqw //
+ |#inA @(ex_intro … w) @(ex_intro … [ ]) /3/
+ ]
+qed.
+
+lemma epsilon_cat_l: ∀S.∀A:word S →Prop.
+ {ϵ} · A ≐ A.
+#S #A #w %
+ [* #w1 * #w2 * * #eqw normalize #eqw2 <eqw <eqw2 //
+ |#inA @(ex_intro … ϵ) @(ex_intro … w) /3/
]
qed.
-lemma epsilon_to_true : ∀S.∀e:pre S. ϵ ∈ e → \snd e = true.
-#S * #i #b cases b // normalize #H @False_ind /2/
+(* Concatenation is distributive w.r.t. union. *)
+
+lemma distr_cat_r: ∀S.∀A,B,C:word S →Prop.
+ (A ∪ B) · C ≐ A · C ∪ B · C.
+#S #A #B #C #w %
+ [* #w1 * #w2 * * #eqw * /6/ |* * #w1 * #w2 * * /6/]
qed.
-lemma true_to_epsilon : ∀S.∀e:pre S. \snd e = true → ϵ ∈ e.
-#S * #i #b #btrue normalize in btrue; >btrue %2 //
+lemma distr_cat_r_eps: ∀S.∀A,C:word S →Prop.
+ (A ∪ {ϵ}) · C ≐ A · C ∪ C.
+ #S #A #C @eqP_trans [|@distr_cat_r |@eqP_union_l @epsilon_cat_l]
qed.
-lemma minus_eps_item: ∀S.∀i:pitem S. \sem{i} =1 \sem{i}-{[ ]}.
-#S #i #w %
- [#H whd % // normalize @(not_to_not … (not_epsilon_lp …i)) //
- |* //
+(* The following is a major property of derivatives *)
+
+lemma deriv_middot: ∀S,A,B,a. ¬ A ϵ → deriv S (A·B) a ≐ (deriv S A a) · B.
+#S #A #B #a #noteps #w normalize %
+ [* #w1 cases w1
+ [* #w2 * * #_ #Aeps @False_ind /2/
+ |#b #w2 * #w3 * * whd in ⊢ ((??%?)→?); #H destruct
+ #H #H1 @(ex_intro … w2) @(ex_intro … w3) % // % //
+ ]
+ |* #w1 * #w2 * * #H #H1 #H2 @(ex_intro … (a::w1))
+ @(ex_intro … w2) % // % normalize //
]
+qed.
+
+(*
+Main Properties of Kleene's star
+
+We conclude this section with some important properties of Kleene's
+star that will be used in the following chapters. *)
+
+lemma espilon_in_star: ∀S.∀A:word S → Prop.
+ A^* ϵ.
+#S #A @(ex_intro … [ ]) normalize /2/
qed.
-lemma minus_eps_pre: ∀S.∀e:pre S. \sem{\fst e} =1 \sem{e}-{[ ]}.
-#S * #i *
- [>sem_pre_true normalize in ⊢ (??%?); #w %
- [/3/ | * * // #H1 #H2 @False_ind @(absurd …H1 H2)]
- |>sem_pre_false normalize in ⊢ (??%?); #w % [ /3/ | * // ]
+lemma cat_to_star:∀S.∀A:word S → Prop.
+ ∀w1,w2. A w1 → A^* w2 → A^* (w1@w2).
+#S #A #w1 #w2 #Aw * #l * #H #H1 @(ex_intro … (w1::l))
+% normalize destruct /2/
+qed.
+
+lemma fix_star: ∀S.∀A:word S → Prop.
+ A^* ≐ A · A^* ∪ {ϵ}.
+#S #A #w %
+ [* #l generalize in match w; -w cases l [normalize #w * /2/]
+ #w1 #tl #w * whd in ⊢ ((??%?)→?); #eqw whd in ⊢ (%→?); *
+ #w1A #cw1 %1 @(ex_intro … w1) @(ex_intro … (flatten S tl))
+ % destruct /2/ whd @(ex_intro … tl) /2/
+ |* [2: whd in ⊢ (%→?); #eqw <eqw //]
+ * #w1 * #w2 * * #eqw <eqw @cat_to_star
]
-qed.
\ No newline at end of file
+qed.
+
+lemma star_fix_eps : ∀S.∀A:word S → Prop.
+ A^* ≐ (A - {ϵ}) · A^* ∪ {ϵ}.
+#S #A #w %
+ [* #l elim l
+ [* whd in ⊢ ((??%?)→?); #eqw #_ %2 <eqw //
+ |* [#tl #Hind * #H * #_ #H2 @Hind % [@H | //]
+ |#a #w1 #tl #Hind * whd in ⊢ ((??%?)→?); #H1 * #H2 #H3 %1
+ @(ex_intro … (a::w1)) @(ex_intro … (flatten S tl)) %
+ [% [@H1 | normalize % [@H2 | % #H4 destruct (H4)] ]
+ |whd @(ex_intro … tl) /2/]
+ ]
+ ]
+ |* [* #w1 * #w2 * * #eqw * #H1 #_ <eqw @cat_to_star //
+ | whd in ⊢ (%→?); #H <H //
+ ]
+ ]
+qed.
+
+lemma star_epsilon: ∀S:DeqSet.∀A:word S → Prop.
+ A^* ∪ {ϵ} ≐ A^*.
+#S #A #w % /2/ * //
+qed.
+