2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
22 | Pro_string of string (* "..." or '...' *)
27 (* The class without properties; but you can still compare if two objects
32 | Begin_entity (* Beginning of entity *)
33 | End_entity (* End of entity *)
34 | Comment_begin (* <!-- *)
35 | Comment_material of string (* within a comment *)
36 | Comment_end (* --> *)
37 | Ignore (* ignored whitespace *)
39 | Rangle (* > as tag delimiter *)
40 | Rangle_empty (* /> as tag delimiter *)
41 | Percent (* % followed by space in declaration *)
42 | Plus (* + in declaration *)
43 | Star (* * in declaration *)
44 | Bar (* | in declaration *)
45 | Comma (* , in declaration *)
46 | Qmark (* ? in declaration *)
47 | Pcdata (* #PCDATA in declaration *)
48 | Required (* #REQUIRED in declaration *)
49 | Implied (* #IMPLIED in declaration *)
50 | Fixed (* #FIXED in declaration *)
51 | Bof (* A marker for 'beginning of file' *)
52 | Eof (* End of file *)
(* Many of the following constructors carry an entity_id. According to the
 * change log at the end of this file (entry of 1999/08/31), tokens such as
 * Decl_element and Decl_rangle carry the ID so that proper nesting can be
 * checked: the lexer emits a dummy ID, the entity layer replaces it with
 * the ID of the current entity, and the parser compares the IDs of
 * matching pairs. *)
53 | Conditional_begin of entity_id (* <![ in declaration *)
54 | Conditional_body of entity_id (* [ in declaration *)
55 | Conditional_end of entity_id (* ]]> in declaration *)
56 | Doctype of entity_id (* <!DOCTYPE *)
57 | Doctype_rangle of entity_id (* > as DOCTYPE delimiter *)
58 | Dtd_begin of entity_id (* '[' after DOCTYPE *)
59 | Dtd_end of entity_id (* ']' *)
60 | Decl_element of entity_id (* <!ELEMENT *)
61 | Decl_attlist of entity_id (* <!ATTLIST *)
62 | Decl_entity of entity_id (* <!ENTITY *)
63 | Decl_notation of entity_id (* <!NOTATION *)
64 | Decl_rangle of entity_id (* > *)
65 | Lparen of entity_id (* ( in declaration *)
66 | Rparen of entity_id (* ) in declaration *)
67 | RparenPlus of entity_id (* )+ in declaration *)
68 | RparenStar of entity_id (* )* in declaration *)
69 | RparenQmark of entity_id (* )? in declaration *)
71 | Tag_beg of (string*entity_id) (* <name *)
72 | Tag_end of (string*entity_id) (* </name *)
74 | PI of (string*string) (* <?name ... ?> *)
75 | PI_xml of (prolog_token list) (* <?xml ...?> *)
76 | Cdata of string (* <![CDATA[...]]> *)
77 | CRef of int (* &#digits; *)
78 | ERef of string (* &name; *)
79 | PERef of string (* %name; *)
80 | CharData of string (* any characters not otherwise matching *)
82 | Name of string (* name *)
83 | Nametoken of string (* nmtoken but not name *)
84 | Attval of string (* attribute value; may contain entity refs *)
(* NOTE(review): Attval_nl_normalized is undocumented here; presumably an
 * attribute value whose line ends are already normalized -- confirm. *)
85 | Attval_nl_normalized of string
86 | Unparsed_string of string (* "data" or 'data' *)
89 (**********************************************************************)
(* string_of_tok tok: returns the name of the constructor of tok as a
 * string, e.g. "Begin_entity" for Begin_entity. Arguments carried by the
 * token are ignored (matched with _), so the result identifies only the
 * kind of token, not its contents. *)
92 let string_of_tok tok =
94 Begin_entity -> "Begin_entity"
95 | End_entity -> "End_entity"
96 | Doctype _ -> "Doctype"
97 | Doctype_rangle _ -> "Doctype_rangle"
98 | Comment_begin -> "Comment_begin"
99 | Comment_end -> "Comment_end"
100 | Comment_material _ -> "Comment_material"
102 | Rangle_empty -> "Rangle_empty"
105 | Dtd_begin _ -> "Dtd_begin"
106 | Dtd_end _ -> "Dtd_end"
107 | Conditional_begin _ -> "Conditional_begin"
108 | Conditional_body _ -> "Conditional_body"
109 | Conditional_end _ -> "Conditional_end"
110 | Percent -> "Percent"
111 | Lparen _ -> "Lparen"
112 | Rparen _ -> "Rparen"
119 | Required -> "Required"
120 | Implied -> "Implied"
122 | Decl_element _ -> "Decl_element"
123 | Decl_attlist _ -> "Decl_attlist"
124 | Decl_entity _ -> "Decl_entity"
125 | Decl_notation _ -> "Decl_notation"
126 | Decl_rangle _ -> "Decl_rangle"
127 | RparenPlus _ -> "RparenPlus"
128 | RparenStar _ -> "RparenStar"
129 | RparenQmark _ -> "RparenQmark"
133 | PI_xml _ -> "PI_xml"
134 | Tag_beg _ -> "Tag_beg"
135 | Tag_end _ -> "Tag_end"
140 | CharData _ -> "CharData"
142 | Nametoken _ -> "Nametoken"
143 | Attval _ -> "Attval"
144 | Attval_nl_normalized _ -> "Attval_nl_normalized"
145 | Unparsed_string _ -> "Unparsed_string"
146 | LineEnd _ -> "LineEnd"
(* A record of scanner functions over a Lexing.lexbuf, together with a
 * representation encoding (lex_encoding). The fields fall into three
 * groups according to their result type; see the comments below. *)
150 { lex_encoding : Pxp_types.rep_encoding;
(* Scanners returning the recognized token paired with a value of type
 * lexers. NOTE(review): presumably that value tells the caller which
 * scanner to apply next -- confirm against the definition of lexers. *)
151 scan_document : Lexing.lexbuf -> (token * lexers);
152 scan_content : Lexing.lexbuf -> (token * lexers);
153 scan_within_tag : Lexing.lexbuf -> (token * lexers);
154 scan_document_type : Lexing.lexbuf -> (token * lexers);
155 scan_declaration : Lexing.lexbuf -> (token * lexers);
156 scan_content_comment : Lexing.lexbuf -> (token * lexers);
157 scan_decl_comment : Lexing.lexbuf -> (token * lexers);
158 scan_document_comment: Lexing.lexbuf -> (token * lexers);
159 scan_ignored_section : Lexing.lexbuf -> (token * lexers);
(* Scanner returning a prolog_token, the element type of the list carried
 * by the PI_xml token. *)
160 scan_xml_pi : Lexing.lexbuf -> prolog_token;
(* Scanners returning a single token without a lexers value. *)
161 scan_dtd_string : Lexing.lexbuf -> token;
162 scan_content_string : Lexing.lexbuf -> token;
163 scan_name_string : Lexing.lexbuf -> token;
164 scan_only_xml_decl : Lexing.lexbuf -> token;
165 scan_for_crlf : Lexing.lexbuf -> token;
168 (* ======================================================================
172 * Revision 1.1 2000/11/17 09:57:29 lpadovan
175 * Revision 1.2 2000/08/18 20:14:31 gerd
176 * Comment -> Comment_begin, Comment_material, Comment_end.
178 * Revision 1.1 2000/05/29 23:48:38 gerd
179 * Changed module names:
180 * Markup_aux into Pxp_aux
181 * Markup_codewriter into Pxp_codewriter
182 * Markup_document into Pxp_document
183 * Markup_dtd into Pxp_dtd
184 * Markup_entity into Pxp_entity
185 * Markup_lexer_types into Pxp_lexer_types
186 * Markup_reader into Pxp_reader
187 * Markup_types into Pxp_types
188 * Markup_yacc into Pxp_yacc
189 * See directory "compatibility" for (almost) compatible wrappers emulating
190 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
192 * ======================================================================
193 * Old logs from markup_lexer_types.ml:
195 * Revision 1.6 2000/05/29 21:14:57 gerd
196 * Changed the type 'encoding' into a polymorphic variant.
198 * Revision 1.5 2000/05/20 20:31:40 gerd
199 * Big change: Added support for various encodings of the
200 * internal representation.
202 * Revision 1.4 2000/05/14 17:45:36 gerd
205 * Revision 1.3 2000/05/14 17:35:12 gerd
206 * Conditional_begin, _end, and _body have an entity_id.
208 * Revision 1.2 2000/05/08 21:59:06 gerd
209 * New token Bof (beginning of file).
211 * Revision 1.1 2000/05/06 23:21:49 gerd
215 * ======================================================================
217 * DERIVED FROM REVISION 1.4 of markup_lexer_types_shadow.ml
219 * Revision 1.4 2000/04/30 18:19:04 gerd
222 * Revision 1.3 1999/08/31 19:13:31 gerd
223 * Added checks on proper PE nesting. The idea is that tokens such
224 * as Decl_element and Decl_rangle carry an entity ID with them. This ID
225 * is simply an object of type < >, i.e. you can only test on identity.
226 * The lexer always produces tokens with a dummy ID because it does not
227 * know which entity is the current one. The entity layer replaces the dummy
228 * ID with the actual ID. The parser checks that the IDs of pairs such as
229 * Decl_element and Decl_rangle are the same; otherwise a Validation_error
232 * Revision 1.2 1999/08/10 21:35:08 gerd
233 * The XML/encoding declaration at the beginning of entities is
234 * evaluated. In particular, entities now have a method "xml_declaration"
235 * which returns the name/value pairs of such a declaration. The "encoding"
236 * setting is interpreted by the entity itself; "version", and "standalone"
237 * are interpreted by Markup_yacc.parse_document_entity. Other settings
238 * are ignored (this does not conform to the standard; the standard prescribes
239 * that "version" MUST be given in the declaration of document; "standalone"
240 * and "encoding" CAN be declared; no other settings are allowed).
241 * TODO: The user should be warned if the standard is not exactly
242 * fulfilled. -- The "standalone" property is not checked yet.
244 * Revision 1.1 1999/08/10 00:35:51 gerd