2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
14 (**********************************************************************)
16 class dtd the_warner init_encoding =
18 val mutable root = (None : string option)
19 val mutable id = (None : dtd_id option)
21 val warner = (the_warner : collect_warnings)
22 val encoding = init_encoding
23 val lexerset = Pxp_lexers.get_lexer_set init_encoding
25 val elements = (Hashtbl.create 100 : (string,dtd_element) Hashtbl.t)
26 val gen_entities = (Hashtbl.create 100 : (string,entity * bool) Hashtbl.t)
27 val par_entities = (Hashtbl.create 100 : (string,entity) Hashtbl.t)
28 val notations = (Hashtbl.create 100 : (string,dtd_notation) Hashtbl.t)
29 val pinstr = (Hashtbl.create 100 : (string,proc_instruction) Hashtbl.t)
30 val mutable element_names = []
31 val mutable gen_entity_names = []
32 val mutable par_entity_names = []
33 val mutable notation_names = []
34 val mutable pinstr_names = []
36 val mutable allow_arbitrary = false
37 val mutable standalone_declaration = false
39 val mutable validated = false
42 let w = new drop_warnings in
44 (new internal_entity self "lt" w "<" false false false encoding)
47 (new internal_entity self "gt" w ">" false false false encoding)
50 (new internal_entity self "amp" w "&" false false false encoding)
53 (new internal_entity self "apos" w "'" false false false encoding)
56 (new internal_entity self "quot" w """ false false false encoding)
60 method encoding = encoding
62 method warner = warner
78 method standalone_declaration = standalone_declaration
80 method set_standalone_declaration b =
81 standalone_declaration <- b
83 method allow_arbitrary =
84 allow_arbitrary <- true
86 method disallow_arbitrary =
87 allow_arbitrary <- false
89 method arbitrary_allowed = allow_arbitrary
95 method add_element el =
96 (* raises Not_found if 'el' has already been added *)
97 (* Note: 'el' is encoded in the same way as 'self'! *)
98 let name = el # name in
99 check_name warner name;
100 if Hashtbl.mem elements name then
102 Hashtbl.add elements name el;
103 element_names <- name :: element_names;
107 method add_gen_entity en extdecl =
108 (* The following is commented out; perhaps there should be an option
109 * to reactivate it on demand
111 (* raises Validation_error if the predefines entities 'lt', 'gt', 'amp',
112 * 'quot', and 'apos' are redeclared with an improper value.
114 if en # encoding <> encoding then
115 failwith "Pxp_dtd.dtd # add_gen_entity: Inconsistent encodings";
116 let name = en # name in
117 check_name warner name;
118 if Hashtbl.mem gen_entities name then begin
119 if List.mem name [ "lt"; "gt"; "amp"; "quot"; "apos" ] then begin
120 (* These are allowed to be declared several times *)
121 let (rt,_) = en # replacement_text in
122 let toks = tokens_of_content_string lexerset rt in
124 begin match toks with
125 [CRef 60] -> if name <> "lt" then raise Not_found
126 | [CharData ">"] -> if name <> "gt" then raise Not_found
127 | [CRef 62] -> if name <> "gt" then raise Not_found
128 | [CRef 38] -> if name <> "amp" then raise Not_found
129 | [CharData "'"] -> if name <> "apos" then raise Not_found
130 | [CRef 39] -> if name <> "apos" then raise Not_found
131 | [CharData "\""] -> if name <> "quot" then raise Not_found
132 | [CRef 34] -> if name <> "quot" then raise Not_found
133 | _ -> raise Not_found
137 raise (Validation_error("Predefined entity `" ^ name ^
141 warner # warn ("Entity `" ^ name ^ "' declared twice")
144 Hashtbl.add gen_entities name (en, extdecl);
145 gen_entity_names <- name :: gen_entity_names
149 method add_par_entity en =
150 if en # encoding <> encoding then
151 failwith "Pxp_dtd.dtd # add_par_entity: Inconsistent encodings";
152 let name = en # name in
153 check_name warner name;
154 if not (Hashtbl.mem par_entities name) then begin
155 Hashtbl.add par_entities name en;
156 par_entity_names <- name :: par_entity_names
159 warner # warn ("Entity `" ^ name ^ "' declared twice")
162 method add_notation no =
163 (* raises Validation_error if 'no' already added *)
164 if no # encoding <> encoding then
165 failwith "Pxp_dtd.dtd # add_notation: Inconsistent encodings";
166 let name = no # name in
167 check_name warner name;
168 if Hashtbl.mem notations name then
169 raise (Validation_error("Notation `" ^ name ^ "' declared twice"));
170 Hashtbl.add notations name no;
171 notation_names <- name :: notation_names
174 method add_pinstr pi =
175 if pi # encoding <> encoding then
176 failwith "Pxp_dtd.dtd # add_pinstr: Inconsistent encodings";
177 let name = pi # target in
178 check_name warner name;
180 if String.length name >= 4 && String.sub name 0 4 = "pxp:" then begin
183 let _, optname, atts = pi # parse_pxp_option in
184 begin match optname with
185 "optional-element-and-notation-declarations" ->
186 self # allow_arbitrary
187 | "optional-attribute-declarations" ->
188 let lexers = Pxp_lexers.get_lexer_set encoding in
190 try List.assoc "elements" atts
192 raise(Error("Missing `elements' attribute for pxp:dtd"))
194 let el = split_attribute_value lexers el_string in
198 try Hashtbl.find elements e_name
201 raise(Error("Reference to unknown element `" ^
208 raise(Error("Unknown PXP option `" ^
212 raise(Error("The processing instruction target `" ^
213 name ^ "' is not defined by this PXP version"))
216 (*----------------------------------------------------------------------
217 * SUPPORT FOR DEPRECATED PI OPTIONS:
218 * - <?xml:allow_undeclared_elements_and_notations?>
219 * is now <?pxp:dtd optional-element-and-notation-declarations?>
220 * - <?xml:allow_undeclared_attributes <elementname>?>
221 * is now <?pxp:dtd optional-attribute-declarations
222 * elements='<elementname> ...'?>
223 * Please update your DTDs! Alternatively, you may uncommment the
224 * following piece of code.
226 (* if name = "xml:allow_undeclared_elements_and_notations" then *)
227 (* self # allow_arbitrary; *)
228 (* if name = "xml:allow_undeclared_attributes" then begin *)
229 (* let v = pi # value in *)
232 (* Hashtbl.find elements v *)
235 (* raise(Validation_error("Reference to undeclared element `"*)
238 (* e # allow_arbitrary; *)
240 (*----------------------------------------------------------------------
244 Hashtbl.add pinstr name pi;
245 pinstr_names <- name :: pinstr_names;
248 method element name =
249 (* returns the element 'name' or raises Validation_error if not found *)
251 Hashtbl.find elements name
254 if allow_arbitrary then
257 raise(Validation_error("Reference to undeclared element `" ^ name ^ "'"))
259 method element_names =
260 (* returns the list of all names of element declarations *)
264 method gen_entity name =
265 (* returns the entity 'name' or raises WF_error if not found *)
267 Hashtbl.find gen_entities name
270 raise(WF_error("Reference to undeclared general entity `" ^ name ^ "'"))
273 method gen_entity_names = gen_entity_names
276 method par_entity name =
277 (* returns the entity 'name' or raises WF_error if not found *)
279 Hashtbl.find par_entities name
282 raise(WF_error("Reference to undeclared parameter entity `" ^ name ^ "'"))
285 method par_entity_names = par_entity_names
288 method notation name =
289 (* returns the notation 'name' or raises Validation_error if not found *)
291 Hashtbl.find notations name
294 if allow_arbitrary then
297 raise(Validation_error("Reference to undeclared notation `" ^ name ^ "'"))
300 method notation_names = notation_names
304 (* returns the list of all processing instructions contained in the DTD
307 Hashtbl.find_all pinstr name
310 method pinstr_names = pinstr_names
312 method write os enc doctype =
314 write_markup_string ~from_enc:encoding ~to_enc:enc os in
317 if String.contains s '"' then
320 wms ("\"" ^ s ^ "\"");
323 if doctype then begin
326 None -> failwith "#write: DTD without root";
336 try Hashtbl.find notations name with Not_found -> assert false in
337 notation # write os enc)
338 (List.sort compare notation_names);
340 (* Unparsed entities: *)
344 try Hashtbl.find gen_entities name with Not_found -> assert false
346 if ent # is_ndata then begin
347 let xid = ent # ext_id in
348 let notation = ent # notation in
349 wms ("<!ENTITY " ^ name ^ " " );
357 if (s <> "") then begin
362 failwith "#write: External ID Anonymous cannot be represented"
364 wms (" NDATA " ^ notation ^ ">\n");
367 (List.sort compare gen_entity_names);
373 try Hashtbl.find elements name with Not_found -> assert false in
374 element # write os enc)
375 (List.sort compare element_names);
377 (* Processing instructions: *)
381 try Hashtbl.find pinstr name with Not_found -> assert false in
383 (List.sort compare pinstr_names);
388 method write_compact_as_latin1 os doctype =
389 self # write os `Enc_iso88591 doctype
393 (************************************************************)
395 (************************************************************)
397 method only_deterministic_models =
400 let cm = el # content_model in
403 if el # content_dfa = None then
404 raise(Validation_error("The content model of element `" ^
405 n ^ "' is not deterministic"))
413 if validated or allow_arbitrary then
416 (* Validity constraint: Notations in NDATA entity declarations must
422 try Hashtbl.find gen_entities name with Not_found -> assert false
424 if ent # is_ndata then begin
425 let xid = ent # ext_id in
426 let notation = ent # notation in
428 ignore(self # notation notation)
429 (* Raises Validation_error if the constraint is violated *)
436 (* Validate the elements: *)
442 (* Check the root element: *)
443 (* TODO: Check if this piece of code is executed at all! *)
444 begin match root with
448 let _ = Hashtbl.find elements r in ()
451 raise(Validation_error("The root element is not declared"))
460 (************************************************************)
465 (**********************************************************************)
467 and dtd_element the_dtd the_name =
469 val dtd = (the_dtd : dtd)
471 val lexerset = Pxp_lexers.get_lexer_set (the_dtd # encoding)
472 val mutable content_model = Unspecified
473 val mutable content_model_validated = false
474 val mutable content_dfa = lazy None
476 val mutable externally_declared = false
478 val mutable attributes =
479 ([] : (string * ((att_type * att_default) * bool)) list)
480 val mutable attributes_validated = false
482 val mutable id_att_name = None
483 val mutable idref_att_names = []
485 val mutable allow_arbitrary = false
489 method set_cm_and_extdecl m extdecl =
490 if content_model = Unspecified then begin
492 content_model_validated <- false;
493 content_dfa <- lazy (self # compute_content_dfa);
494 externally_declared <- extdecl;
498 raise(Validation_error("Element `" ^ name ^ "' has already a content model"))
500 method content_model = content_model
502 method content_dfa = Lazy.force content_dfa
504 method private compute_content_dfa =
505 match content_model with
507 ( try Some (dfa_of_regexp_content_model re)
508 with Not_found -> None
513 method externally_declared = externally_declared
515 method encoding = dtd # encoding
517 method allow_arbitrary =
518 allow_arbitrary <- true
520 method disallow_arbitrary =
521 allow_arbitrary <- false
523 method arbitrary_allowed = allow_arbitrary
525 method add_attribute aname t d extdecl =
526 if aname <> "xml:lang" & aname <> "xml:space" then
527 check_name (dtd#warner) aname;
528 if List.mem_assoc aname attributes then
529 dtd # warner # warn ("More than one declaration for attribute `" ^
530 aname ^ "' of element type `" ^ name ^ "'")
532 begin match aname with
536 let l' = Sort.list ( <= ) l in
537 if l' <> [ "default"; "preserve" ] then
538 raise(Validation_error("Declaration of attribute `xml:space' does not conform to XML specification"))
540 raise(Validation_error("Declaration of attribute `xml:space' does not conform to XML specification"))
546 id_att_name <- Some aname;
547 | (A_idref | A_idrefs) ->
548 idref_att_names <- aname :: idref_att_names
552 attributes <- (aname, ((t,d),extdecl)) :: attributes;
553 attributes_validated <- false;
557 method attribute attname =
559 fst (List.assoc attname attributes)
562 if allow_arbitrary then
565 raise(Validation_error("Attribute `" ^ attname ^ "' of element `"
566 ^ name ^ "' not declared"))
568 method attribute_violates_standalone_declaration attname v =
570 let (atype, adefault), extdecl = List.assoc attname attributes in
574 adefault <> D_required && adefault <> D_implied
575 (* i.e. adefault matches D_default or D_fixed *)
578 normalization_changes_value lexerset atype s
582 if allow_arbitrary then
585 raise(Validation_error("Attribute `" ^ attname ^ "' of element `"
586 ^ name ^ "' not declared"))
589 method attribute_names =
590 List.map fst attributes
592 method names_of_required_attributes =
595 (fun (n,((t,d),_)) ->
596 if d = D_required then
602 method id_attribute_name = id_att_name
604 method idref_attribute_names = idref_att_names
607 method write os enc =
608 let encoding = self # encoding in
610 write_markup_string ~from_enc:encoding ~to_enc:enc os in
612 let rec write_contentspec cs =
615 failwith "#write: Unspecified content model found"
622 write_mixedspec_list ml;
625 write_children re false
627 and write_mixedspec_list ml =
631 if ml' <> [] then wms "|";
632 write_mixedspec_list ml';
635 if ml' <> [] then wms "|";
636 write_mixedspec_list ml';
640 and write_children re cp =
643 let p = needs_parens re' in
645 write_children re' cp;
649 let p = needs_parens re' in
651 write_children re' cp;
655 let p = needs_parens re' in
657 write_children re' cp;
664 write_children re1' true;
668 write_children ren' true;
672 failwith "#write: Illegal content model"
679 write_children re1' true;
683 write_children ren' true;
687 failwith "#write: Illegal content model"
691 if not cp then wms "(";
693 if not cp then wms ")";
695 and needs_parens re =
697 (Optional _ | Repeated _ | Repeated1 _ ) -> true
701 wms ("<!ELEMENT " ^ name ^ " ");
702 write_contentspec content_model;
705 wms ("<!ATTLIST " ^ name);
707 (fun (n,((t,d),_)) ->
710 A_cdata -> wms " CDATA";
712 | A_idref -> wms " IDREF";
713 | A_idrefs -> wms " IDREFS";
714 | A_entity -> wms " ENTITY";
715 | A_entities -> wms " ENTITIES";
716 | A_nmtoken -> wms " NMTOKEN";
717 | A_nmtokens -> wms " NMTOKENS";
729 failwith "#write: Illegal content model";
743 failwith "#write: Illegal content model";
748 D_required -> wms " #REQUIRED"
749 | D_implied -> wms " #IMPLIED"
752 write_data_string ~from_enc:encoding ~to_enc:enc os s;
756 write_data_string ~from_enc:encoding ~to_enc:enc os s;
764 method write_compact_as_latin1 os =
765 self # write os `Enc_iso88591
767 (************************************************************)
769 (************************************************************)
772 self # validate_attributes();
773 self # validate_content_model()
775 method private validate_attributes() =
776 if attributes_validated then
779 (* Validity Constraint: One ID per Element Type *)
780 let n = count (fun (n,((t,d),_)) -> t = A_id) attributes in
782 raise(Validation_error("More than one ID attribute for element `" ^ name ^ "'"));
783 (* Validity Constraint: ID Attribute Default *)
785 (fun (n,((t,d),_)) ->
786 t = A_id & (d <> D_required & d <> D_implied))
789 raise(Validation_error("ID attribute must be #IMPLIED or #REQUIRED; element `" ^ name ^ "'"));
790 (* Validity Constraint: One Notation per Element Type *)
791 let n = count (fun (n,((t,d),_)) ->
792 match t with A_notation _ -> true | _ -> false)
795 raise(Validation_error("More than one NOTATION attribute for element `" ^ name ^ "'"));
796 (* Validity Constraint: Notation Attributes [second part] *)
798 (fun (n,((t,d),_)) ->
803 let _ = dtd # notation nname in ())
807 (* Validity Constraint: Attribute Default Legal *)
809 (fun (n,((t,d),_)) ->
812 let lexical_error() =
813 lazy (raise(Validation_error("Default value for attribute `" ^ n ^ "' is lexically malformed"))) in
814 check_attribute_value_lexically lexerset (lexical_error()) t v;
816 (A_entity|A_entities) ->
819 let en, extdecl = dtd # gen_entity nd in
820 if not (en # is_ndata) then
821 raise(Validation_error("Attribute default value must be the name of an NDATA entity; attribute `" ^ n ^ "' in declaration for element `" ^ name ^ "'"));
822 (* if dtd # standalone_declaration && extdecl then
823 raise(Validation_error("Attribute default value violates the standalone declaration; attribute `" ^ n ^ "' in declaration for element `" ^ name ^ "'"));
824 -- This is checked anyway when the attribute value is normalized
827 (split_attribute_value lexerset v)
829 if not (List.mem v nl) then
830 raise(Validation_error("Illegal default value for attribute `" ^ n ^ "' in declaration for element `" ^ name ^ "'"));
832 if not (List.mem v nl) then
833 raise(Validation_error("Illegal default value for attribute `" ^ n ^ "' in declaration for element `" ^ name ^ "'"));
841 | D_default v -> check v
842 | D_fixed v -> check v
846 (* Ok: This element declaration is valid *)
847 attributes_validated <- true;
851 method private validate_content_model () =
853 * - Validity Constraint: No Duplicate Types
854 * It is not an error if there is a child in the declaration for which
855 * no element declaration is provided.
857 match content_model with
859 dtd # warner # warn ("Element type `" ^ name ^ "' mentioned but not declared");
863 | Mixed (pcdata :: l) ->
864 (* MPCDATA is always the first element by construction *)
865 assert (pcdata = MPCDATA);
867 raise (Validation_error("Double children in declaration for element `" ^ name ^ "'"))
873 (************************************************************)
877 and dtd_notation the_name the_xid init_encoding =
880 val xid = (the_xid : ext_id)
881 val encoding = (init_encoding : Pxp_types.rep_encoding)
884 method encoding = encoding
886 method write os enc =
888 write_markup_string ~from_enc:encoding ~to_enc:enc os in
891 if String.contains s '"' then
894 wms ("\"" ^ s ^ "\"");
897 wms ("<!NOTATION " ^ name ^ " ");
905 if (s <> "") then begin
910 failwith "#write: External ID Anonymous cannot be represented"
914 method write_compact_as_latin1 os =
915 self # write os `Enc_iso88591
919 and proc_instruction the_target the_value init_encoding =
921 val target = the_target
922 val value = (the_value : string)
923 val encoding = (init_encoding : Pxp_types.rep_encoding)
927 ("xml"|"xmL"|"xMl"|"xML"|"Xml"|"XmL"|"XMl"|"XML") ->
928 (* This is an error, not a warning, because I do not have a
929 * "warner" object by hand.
931 raise(WF_error("Reserved processing instruction"))
934 method target = target
936 method encoding = encoding
938 method write os enc =
940 write_markup_string ~from_enc:encoding ~to_enc:enc os in
948 method write_compact_as_latin1 os =
949 self # write os `Enc_iso88591
951 method parse_pxp_option =
952 let lexers = get_lexer_set encoding in
954 let toks = tokens_of_xml_pi lexers value in (* may raise WF_error *)
955 begin match toks with
956 (Pro_name option_name) :: toks' ->
957 let atts = decode_xml_pi toks' in (* may raise WF_error *)
958 (target, option_name, atts)
960 raise(Error("Bad PXP processing instruction"))
964 raise(Error("Bad PXP processing instruction"))
970 (* ======================================================================
974 * Revision 1.1 2000/11/17 09:57:29 lpadovan
977 * Revision 1.10 2000/08/18 21:18:45 gerd
978 * Updated wrong comments for methods par_entity and gen_entity.
979 * These can raise WF_error and not Validation_error, and this is the
982 * Revision 1.9 2000/07/25 00:30:01 gerd
983 * Added support for pxp:dtd PI options.
985 * Revision 1.8 2000/07/23 02:16:34 gerd
988 * Revision 1.7 2000/07/16 17:50:01 gerd
991 * Revision 1.6 2000/07/16 16:34:41 gerd
992 * New method 'write', the successor of 'write_compact_as_latin1'.
994 * Revision 1.5 2000/07/14 13:56:48 gerd
995 * Added methods id_attribute_name and idref_attribute_names.
997 * Revision 1.4 2000/07/09 00:13:37 gerd
998 * Added methods gen_entity_names, par_entity_names.
1000 * Revision 1.3 2000/07/04 22:10:55 gerd
1001 * Update: collect_warnings -> drop_warnings.
1002 * Update: Case ext_id = Anonymous.
1004 * Revision 1.2 2000/06/14 22:19:06 gerd
1005 * Added checks such that it is impossible to mix encodings.
1007 * Revision 1.1 2000/05/29 23:48:38 gerd
1008 * Changed module names:
1009 * Markup_aux into Pxp_aux
1010 * Markup_codewriter into Pxp_codewriter
1011 * Markup_document into Pxp_document
1012 * Markup_dtd into Pxp_dtd
1013 * Markup_entity into Pxp_entity
1014 * Markup_lexer_types into Pxp_lexer_types
1015 * Markup_reader into Pxp_reader
1016 * Markup_types into Pxp_types
1017 * Markup_yacc into Pxp_yacc
1018 * See directory "compatibility" for (almost) compatible wrappers emulating
1019 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
1021 * ======================================================================
1023 * Revision 1.18 2000/05/28 17:24:55 gerd
1026 * Revision 1.17 2000/05/27 19:21:25 gerd
1027 * Implemented the changes of rev. 1.10 of markup_dtd.mli.
1029 * Revision 1.16 2000/05/20 20:31:40 gerd
1030 * Big change: Added support for various encodings of the
1031 * internal representation.
1033 * Revision 1.15 2000/05/14 21:50:07 gerd
1034 * Updated: change in internal_entity.
1036 * Revision 1.14 2000/05/06 23:08:46 gerd
1037 * It is possible to allow undeclared attributes.
1039 * Revision 1.13 2000/05/01 20:42:46 gerd
1040 * New method write_compact_as_latin1.
1042 * Revision 1.12 2000/05/01 15:16:57 gerd
1043 * The errors "undeclared parameter/general entities" are
1044 * well-formedness errors, not validation errors.
1046 * Revision 1.11 2000/03/11 22:58:15 gerd
1047 * Updated to support Markup_codewriter.
1049 * Revision 1.10 2000/01/20 20:53:47 gerd
1050 * Changed such that it runs with Markup_entity's new interface.
1052 * Revision 1.9 1999/11/09 22:15:41 gerd
1053 * Added method "arbitrary_allowed".
1055 * Revision 1.8 1999/09/01 22:52:22 gerd
1056 * If 'allow_arbitrary' is in effect, no validation happens anymore.
1058 * Revision 1.7 1999/09/01 16:21:24 gerd
1059 * Added several warnings.
1060 * The attribute type of "xml:space" is now strictly checked.
1062 * Revision 1.6 1999/08/15 20:34:21 gerd
1063 * Improved error messages.
1064 * Bugfix: It is no longer allowed to create processing instructions
1065 * with target "xml".
1067 * Revision 1.5 1999/08/15 02:20:16 gerd
1068 * New feature: a DTD can allow arbitrary elements.
1070 * Revision 1.4 1999/08/15 00:21:39 gerd
1071 * Comments have been updated.
1073 * Revision 1.3 1999/08/14 22:12:52 gerd
1074 * Several functions have now a "warner" as argument which is
1075 * an object with a "warn" method. This is used to warn about characters
1076 * that cannot be represented in the Latin 1 alphabet.
1077 * Bugfix: if two general entities with the same name are definied,
1078 * the first counts, not the second.
1080 * Revision 1.2 1999/08/11 14:56:35 gerd
1081 * Declaration of the predfined entities {lt,gt,amp,quot,apos}
1082 * is no longer forbidden; but the original definition cannot be overriddden.
1083 * TODO: If these entities are redeclared with problematic values,
1084 * the user should be warned.
1086 * Revision 1.1 1999/08/10 00:35:51 gerd