2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
14 exception ID_not_unique
16 class type [ 'ext ] index =
18 (* The type of indexes over the ID attributes of the elements. This type
19 * is the minimum requirement needed by the parser to create such an index.
21 constraint 'ext = 'ext node #extension
22 method add : string -> 'ext node -> unit
23 (* Add the passed node to the index. If there is already an ID with
24 * the passed string value, the exception ID_not_unique should be
25 * raised. (But the index is free also to accept several identical IDs.)
27 method find : string -> 'ext node
28 (* Finds the node with the passed ID value, or raises Not_found *)
33 class [ 'ext ] hash_index :
35 (* This is a simple implementation of 'index' using a hash table. *)
36 constraint 'ext = 'ext node #extension
37 method add : string -> 'ext node -> unit
39 method find : string -> 'ext node
41 method index : (string, 'ext node) Hashtbl.t
42 (* Returns the hash table. *)
48 { warner : collect_warnings;
49 (* An object that collects warnings. *)
51 errors_with_line_numbers : bool;
52 (* Whether error messages contain line numbers or not. The parser
53 * is 10 to 20 per cent faster if line numbers are turned off;
54 * you get only byte positions in this case.
57 enable_pinstr_nodes : bool;
58 (* true: turns a special mode for processing instructions on. Normally,
59 * you cannot determine the exact location of a PI; you only know
60 * in which element the PI occurs. This mode makes it possible
61 * to find out the exact location: Every PI is artificially wrapped
62 * by a special node with type T_pinstr. For example, if the XML text
63 * is <a><?x?><?y?></a>, the parser normally produces only an element
64 * object for "a", and puts the PIs "x" and "y" into it (without
65 * order). In this mode, the object "a" will contain two objects
66 * with type T_pinstr, and the first object will contain "x", and the
67 * second "y": the object tree looks like
68 * - Node with type = T_element "a"
69 * - Node with type = T_pinstr "x"
70 * + contains processing instruction "x"
71 * - Node with type = T_pinstr "y"
72 * + contains processing instruction "y"
75 * (1) In past versions of PXP this mode was called
76 * processing_instructions_inline, and it produced nodes of
77 * type T_element "-pi" instead of T_pinstr.
78 * (2) The T_pinstr nodes are created from the pinstr exemplars
82 enable_super_root_node : bool;
83 (* true: the topmost element of the XML tree is not the root element,
84 * but the so-called super root. The root element is a son of the
85 * super root. The super root is a node with type T_super_root.
86 * The following behaviour changes, too:
87 * - PIs occurring outside the root element and outside the DTD are
88 * added to the super root instead of the document object
89 * - If enable_pinstr_nodes is also turned on, the PI wrappers
90 * are added to the super root
92 * For example, the document
94 * is normally represented by:
96 * + contains PIs x and y
97 * - reference to root node with type = T_element "a"
98 * - node with type = T_data: contains "y"
99 * With enabled super root node:
101 * - reference to super root node with type = T_super_root
102 * + contains PIs x and y
103 * - root node with type = T_element "a"
104 * - node with type = T_data: contains "y"
105 * If also enable_pinstr_nodes:
107 * - reference to super root node with type = T_super_root
108 * - node with type = T_pinstr "x"
110 * - root node with type = T_element "a"
111 * - node with type = T_data: contains "y"
112 * - node with type = T_pinstr "y"
115 * (1) In previous versions of PXP this mode was called
116 * virtual_root, and it produced an additional node of type
117 * T_element "-vr" instead of T_super_root.
118 * (2) The T_super_root node is created from the super root exemplar
122 enable_comment_nodes : bool;
123 (* When enabled, comments are represented as nodes with type =
125 * To access the contents of comments, use the method "comment"
126 * for the comment nodes.
127 * These nodes behave like elements; however, they are normally
128 * empty and do not have attributes. Note that it is possible to
129 * add children to comment nodes and to set attributes, but it is
130 * strongly recommended not to do so. There are no checks on
131 * such abnormal use, because they would cost too
132 * much time, even when no comment nodes are generated at all.
134 * Comment nodes should be disabled unless you must parse a
135 * third-party XML text which uses comments as another data
138 * The nodes of type T_comment are created from the comment exemplars
142 encoding : rep_encoding;
143 (* Specifies the encoding used for the *internal* representation
144 * of any character data.
145 * Note that the default is still Enc_iso88591.
148 recognize_standalone_declaration : bool;
149 (* Whether the "standalone" declaration is recognized or not.
150 * This option does not have an effect on well-formedness parsing:
151 * in this case such declarations are never recognized.
153 * Recognizing the "standalone" declaration means that the
154 * value of the declaration is scanned and passed to the DTD,
155 * and that the "standalone-check" is performed.
157 * Standalone-check: If a document is flagged standalone='yes'
158 * some additional constraints apply. The idea is that a parser
159 * without access to any external document subsets can still parse
160 * the document, and will still return the same values as the parser
161 * with such access. For example, if the DTD is external and if
162 * there are attributes with default values, it is checked that there
163 * is no element instance where these attributes are omitted - the
164 * parser would return the default value but this requires access to
165 * the external DTD subset.
168 store_element_positions : bool;
169 (* Whether the file name, the line and the column of the
170 * beginning of elements are stored in the element nodes.
171 * This option may be useful to generate error messages.
173 * Positions are only stored for:
175 * - Wrapped processing instructions (see enable_pinstr_nodes)
176 * For all other node types, no position is stored.
178 * You can access positions by the method "position" of nodes.
182 (* Whether the parser does a second pass and checks that all
183 * IDREF and IDREFS attributes contain valid references.
184 * This option works only if an ID index is available. To create
185 * an ID index, pass an index object as id_index argument to the
186 * parsing functions (such as parse_document_entity; see below).
188 * "Second pass" does not mean that the XML text is again parsed;
189 * only the existing document tree is traversed, and the check
190 * on bad IDREF/IDREFS attributes is performed for every node.
193 validate_by_dfa : bool;
194 (* If true, and if DFAs are available for validation, the DFAs will
195 * actually be used for validation.
196 * If false, or if no DFAs are available, the standard backtracking
197 * algorithm will be used.
198 * DFA = deterministic finite automaton.
200 * DFAs are only available if accept_only_deterministic_models is
201 * "true" (because in this case, it is relatively cheap to construct
202 * the DFAs). DFAs are a data structure which ensures that validation
203 * can always be performed in linear time.
205 * I strongly recommend using DFAs; however, there are examples
206 * for which validation by backtracking is faster.
209 accept_only_deterministic_models : bool;
210 (* Whether only deterministic content models are accepted in DTDs. *)
212 (* The following options are not implemented, or only for internal
216 debugging_mode : bool;
221 Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
222 | ExtID of (ext_id * Pxp_reader.resolver)
225 ?system_encoding:encoding -> ?id:ext_id -> ?fixenc:encoding ->
229 ?fixenc:encoding -> string -> source
232 ?system_encoding:encoding -> string -> source
234 (* Notes on sources (version 2):
236 * Sources specify where the XML text to parse comes from. Sources not only
237 * represent character streams, but also external IDs (i.e. SYSTEM or PUBLIC
238 * names), and they are interpreted as a specific encoding of characters.
239 * A source should be associated with an external ID, because otherwise
240 * it is not known how to handle relative names.
242 * There are two primary sources, Entity and ExtID, and several functions
243 * for derived sources. First explanations for the functions:
245 * from_channel: The XML text is read from an in_channel. By default, the
246 * channel is not associated with an external ID, and it is impossible
247 * to resolve relative SYSTEM IDs found in the document.
248 * If the ?id argument is passed, it is assumed that the channel has this
249 * external ID. If relative SYSTEM IDs occur in the document, they can
250 * be interpreted; however, it is only possible to read from "file:"
252 * By default, the channel automatically detects the encoding. You can
253 * set a fixed encoding by passing the ?fixenc argument.
255 * from_string: The XML text is read from a string.
256 * It is impossible to read from any external entity whose reference is found
258 * By default, the encoding of the string is detected automatically. You can
259 * set a fixed encoding by passing the ?fixenc argument.
261 * from_file: The XML text is read from the file whose file name is
262 * passed to the function (as UTF-8 string).
263 * Relative system IDs can be interpreted by this function.
264 * The ?system_encoding argument specifies the character encoding used
265 * for file names (sic!). By default, UTF-8 is assumed.
269 * from_file "/tmp/file.xml":
270 * reads from this file, which is assumed to have the ID
271 * SYSTEM "file://localhost/tmp/file.xml".
273 * let ch = open_in "/tmp/file.xml" in
274 * from_channel ~id:(System "file://localhost/tmp/file.xml") ch
275 * This does the same, but uses a channel.
277 * from_channel ~id:(System "http://host/file.xml")
279 * reads from the channel ch, and it is assumed that the ID is
280 * SYSTEM "http://host/file.xml". If there is any relative SYSTEM ID,
281 * it will be interpreted relative to this location; however, there is
282 * no way to read via HTTP.
283 * If there is any "file:" SYSTEM ID, it is possible to read the file.
285 * The primary sources:
287 * - ExtID(x,r): The identifier x (either the SYSTEM or the PUBLIC name) of the
288 * entity to read from is passed to the resolver, and the resolver finds
289 * the entity and opens it.
290 * The intention of this option is to allow customized
291 * resolvers to interpret external identifiers without any restriction.
292 * The Pxp_reader module contains several classes allowing the user to
293 * compose such a customized resolver from predefined components.
295 * ExtID is the interface of choice for your own extensions to resolvers.
297 * - Entity(m,r): You can implement every behaviour by using a customized
298 * entity class. Once the DTD object d is known that will be used during
299 * parsing, the entity e = m d is determined and used together with the
301 * This is only for hackers.
306 val default_config : config
307 (* - Warnings are thrown away
308 * - Error messages will contain line numbers
309 * - Neither T_super_root nor T_pinstr nor T_comment nodes are generated
310 * - The internal encoding is ISO-8859-1
311 * - The standalone declaration is checked
312 * - Element positions are stored
313 * - The IDREF pass is left out
314 * - If available, DFAs are used for validation
315 * - Only deterministic content models are accepted
318 val default_extension : ('a node extension) as 'a
319 (* A "null" extension; an extension that does not extend the functionality.
 * As the signature shows, its type is recursive: it is an extension for
 * nodes that themselves carry this same extension type.
 *)
321 val default_spec : ('a node extension as 'a) spec
322 (* Specifies that you do not want to use extensions.
 * The extension type parameter is the same recursive type as that of
 * default_extension. The value has type "spec", so it fits the 'ext spec
 * argument taken by the parse_* functions declared in this interface.
 *)
324 val parse_dtd_entity : config -> source -> dtd
325 (* Parse an entity containing a DTD (external subset), and return this DTD.
 * Arguments: the parser configuration (type config), and the source
 * (type source) from which the entity is read.
 *)
327 val extract_dtd_from_document_entity : config -> source -> dtd
328 (* Parses a closed document, i.e. a document beginning with <!DOCTYPE...>,
329 * and returns the DTD contained in the document.
330 * The parts of the document outside the DTD are actually not parsed,
331 * i.e. parsing stops when all declarations of the DTD have been read.
334 val parse_document_entity :
335 ?transform_dtd:(dtd -> dtd) ->
336 ?id_index:('ext index) ->
337 config -> source -> 'ext spec -> 'ext document
338 (* Parse a closed document, i.e. a document beginning with <!DOCTYPE...>,
339 * and validate the contents of the document against the DTD contained
340 * and/or referenced in the document.
342 * If the optional argument ~transform_dtd is passed, the following
343 * modification applies: After the DTD (both the internal and external
344 * subsets) has been parsed, the function ~transform_dtd is called,
345 * and the resulting DTD is actually used to validate the document.
347 * If the optional argument ~transform_dtd is missing, the parser
348 * behaves in the same way as if the identity were passed as ~transform_dtd.
350 * If the optional argument ~id_index is present, the parser adds
351 * any ID attribute to the passed index. An index is required to detect
352 * violations of the uniqueness of IDs.
355 val parse_wfdocument_entity :
356 config -> source -> 'ext spec -> 'ext document
357 (* Parse a closed document (see parse_document_entity), but do not
358 * validate it. Only checks on well-formedness are performed.
361 val parse_content_entity :
362 ?id_index:('ext index) ->
363 config -> source -> dtd -> 'ext spec -> 'ext node
364 (* Parse a file representing a well-formed fragment of a document. The
365 * fragment must be a single element (i.e. something like <a>...</a>;
366 * not a sequence like <a>...</a><b>...</b>). The element is validated
367 * against the passed DTD, but it is not checked whether the element is
368 * the root element specified in the DTD.
370 * If the optional argument ~id_index is present, the parser adds
371 * any ID attribute to the passed index. An index is required to detect
372 * violations of the uniqueness of IDs.
375 val parse_wfcontent_entity :
376 config -> source -> 'ext spec -> 'ext node
377 (* Parse a file representing a well-formed fragment of a document
378 * (see parse_content_entity). The fragment is not validated, only
379 * checked for well-formedness.
386 (* ======================================================================
390 * Revision 1.1 2000/11/17 09:57:30 lpadovan
393 * Revision 1.7 2000/08/18 20:15:43 gerd
395 * - enable_super_root_node: new name for virtual_root
396 * - enable_pinstr_nodes: new name for processing_instructions_inline
397 * - enable_comment_nodes: new option
398 * Updated comments for various options.
400 * Revision 1.6 2000/07/23 02:16:33 gerd
403 * Revision 1.5 2000/07/14 13:57:29 gerd
404 * Added the id_index feature.
406 * Revision 1.4 2000/07/09 17:52:54 gerd
407 * New option store_element_positions.
409 * Revision 1.3 2000/07/08 16:26:21 gerd
410 * Added the signatures of the functions
411 * 'extract_dtd_from_document_entity' and 'parse_wfcontent_entity'.
412 * Updated the signature of 'parse_document_entity': New optional
413 * argument 'transform_dtd'.
414 * Updated the comments.
416 * Revision 1.2 2000/07/04 22:09:03 gerd
417 * MAJOR CHANGE: Redesign of the interface (not yet complete).
419 * Revision 1.1 2000/05/29 23:48:38 gerd
420 * Changed module names:
421 * Markup_aux into Pxp_aux
422 * Markup_codewriter into Pxp_codewriter
423 * Markup_document into Pxp_document
424 * Markup_dtd into Pxp_dtd
425 * Markup_entity into Pxp_entity
426 * Markup_lexer_types into Pxp_lexer_types
427 * Markup_reader into Pxp_reader
428 * Markup_types into Pxp_types
429 * Markup_yacc into Pxp_yacc
430 * See directory "compatibility" for (almost) compatible wrappers emulating
431 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
433 * ======================================================================
434 * Old logs from markup_yacc.mli:
436 * Revision 1.4 2000/05/29 21:14:57 gerd
437 * Changed the type 'encoding' into a polymorphic variant.
439 * Revision 1.3 2000/05/27 19:24:01 gerd
440 * New option: recognize_standalone_declaration.
442 * Revision 1.2 2000/05/20 20:31:40 gerd
443 * Big change: Added support for various encodings of the
444 * internal representation.
446 * Revision 1.1 2000/05/06 23:21:49 gerd
449 * Revision 1.9 2000/04/30 18:23:38 gerd
450 * New config options 'processing_instructions_inline' and
453 * Revision 1.8 2000/03/13 23:46:46 gerd
454 * Change: The 'resolver' component of the 'config' type has
455 * disappeared. Instead, there is a new resolver component in the Entity
456 * and ExtID values of 'source'. I hope that this makes clearer that the
457 * resolver has only an effect if used together with Entity and ExtID
459 * Change: The Entity value can now return the entity dependent
460 * on the DTD that is going to be used.
462 * Revision 1.7 2000/02/22 02:32:02 gerd
465 * Revision 1.6 2000/02/22 01:52:45 gerd
466 * Added documentation.
468 * Revision 1.5 2000/01/20 20:54:43 gerd
469 * New config.errors_with_line_numbers.
471 * Revision 1.4 1999/09/01 23:09:10 gerd
472 * New function parse_wf_entity that simulates a well-formedness
475 * Revision 1.3 1999/09/01 16:26:36 gerd
476 * Added an empty line. This is *really* a big change.
478 * Revision 1.2 1999/08/14 22:20:27 gerd
479 * The "config" slot now has a component "warner" which is
480 * an object with a "warn" method. This is used to warn about characters
481 * that cannot be represented in the Latin 1 alphabet.
482 * Furthermore, there is a new component "debugging_mode".
484 * Revision 1.1 1999/08/10 00:35:52 gerd