1 <!ENTITY markup-yacc.mli '
7 exception ID_not_unique
9 class type [ 'ext ] index =
11 (* The type of indexes over the ID attributes of the elements. This type
12 * is the minimum requirement needed by the parser to create such an index.
14 constraint 'ext = 'ext node #extension
15 method add : string -> 'ext node -> unit
16 (* Add the passed node to the index. If there is already an ID with
17 * the passed string value, the exception ID_not_unique should be
18 * raised. (But the index is also free to accept several identical IDs.)
20 method find : string -> 'ext node
21 (* Finds the node with the passed ID value, or raises Not_found *)
26 class [ 'ext ] hash_index :
28 (* This is a simple implementation of 'index' using a hash table. *)
29 constraint 'ext = 'ext node #extension
30 method add : string -> 'ext node -> unit
32 method find : string -> 'ext node
34 method index : (string, 'ext node) Hashtbl.t
35 (* Returns the hash table. *)
41 { warner : collect_warnings;
42 (* An object that collects warnings. *)
44 errors_with_line_numbers : bool;
45 (* Whether error messages contain line numbers or not. The parser
46 * is 10 to 20 per cent faster if line numbers are turned off;
47 * you get only byte positions in this case.
50 enable_pinstr_nodes : bool;
51 (* true: turns on a special mode for processing instructions. Normally,
52 * you cannot determine the exact location of a PI; you only know
53 * in which element the PI occurs. This mode makes it possible
54 * to find out the exact location: Every PI is artificially wrapped
55 * by a special node with type T_pinstr. For example, if the XML text
56 * is <a><?x?><?y?></a>, the parser normally produces only an element
57 * object for "a", and puts the PIs "x" and "y" into it (without
58 * order). In this mode, the object "a" will contain two objects
59 * with type T_pinstr, and the first object will contain "x", and the
60 * second "y": the object tree looks like
61 * - Node with type = T_element "a"
62 * - Node with type = T_pinstr "x"
63 * + contains processing instruction "x"
64 * - Node with type = T_pinstr "y"
65 * + contains processing instruction "y"
68 * (1) In past versions of PXP this mode was called
69 * processing_instructions_inline, and it produced nodes of
70 * type T_element "-pi" instead of T_pinstr.
71 * (2) The T_pinstr nodes are created from the pinstr exemplars
75 enable_super_root_node : bool;
76 (* true: the topmost element of the XML tree is not the root element,
77 * but the so-called super root. The root element is a son of the
78 * super root. The super root is a node with type T_super_root.
79 * The following behaviour changes, too:
80 * - PIs occurring outside the root element and outside the DTD are
81 * added to the super root instead of the document object
82 * - If enable_pinstr_nodes is also turned on, the PI wrappers
83 * are added to the super root
85 * For example, the document
86 * <?x?><a>y</a><?y?>
87 * is normally represented by:
89 * + contains PIs x and y
90 * - reference to root node with type = T_element "a"
91 * - node with type = T_data: contains "y"
92 * With enabled super root node:
94 * - reference to super root node with type = T_super_root
95 * + contains PIs x and y
96 * - root node with type = T_element "a"
97 * - node with type = T_data: contains "y"
98 * If also enable_pinstr_nodes:
100 * - reference to super root node with type = T_super_root
101 * - node with type = T_pinstr "x"
103 * - root node with type = T_element "a"
104 * - node with type = T_data: contains "y"
105 * - node with type = T_pinstr "y"
108 * (1) In previous versions of PXP this mode was called
109 * virtual_root, and it produced an additional node of type
110 * T_element "-vr" instead of T_super_root.
111 * (2) The T_super_root node is created from the super root exemplar
115 enable_comment_nodes : bool;
116 (* When enabled, comments are represented as nodes with type =
118 * To access the contents of comments, use the method "comment"
119 * for the comment nodes.
120 * These nodes behave like elements; however, they are normally
121 * empty and do not have attributes. Note that it is possible to
122 * add children to comment nodes and to set attributes, but it is
123 * strongly recommended not to do so. There are no checks on
124 * such abnormal use, because they would cost too
125 * much time, even when no comment nodes are generated at all.
127 * Comment nodes should be disabled unless you must parse a
128 * third-party XML text which uses comments as another data
131 * The nodes of type T_comment are created from the comment exemplars
135 encoding : rep_encoding;
136 (* Specifies the encoding used for the *internal* representation
137 * of any character data.
138 * Note that the default is still Enc_iso88591.
141 recognize_standalone_declaration : bool;
142 (* Whether the "standalone" declaration is recognized or not.
143 * This option does not have an effect on well-formedness parsing:
144 * in this case such declarations are never recognized.
146 * Recognizing the "standalone" declaration means that the
147 * value of the declaration is scanned and passed to the DTD,
148 * and that the "standalone-check" is performed.
150 * Standalone-check: If a document is flagged standalone='yes'
151 * some additional constraints apply. The idea is that a parser
152 * without access to any external document subsets can still parse
153 * the document, and will still return the same values as the parser
154 * with such access. For example, if the DTD is external and if
155 * there are attributes with default values, it is checked that there
156 * is no element instance where these attributes are omitted - the
157 * parser would return the default value but this requires access to
158 * the external DTD subset.
161 store_element_positions : bool;
162 (* Whether the file name, the line and the column of the
163 * beginning of elements are stored in the element nodes.
164 * This option may be useful to generate error messages.
166 * Positions are only stored for:
168 * - Wrapped processing instructions (see enable_pinstr_nodes)
169 * For all other node types, no position is stored.
171 * You can access positions by the method "position" of nodes.
175 (* Whether the parser does a second pass and checks that all
176 * IDREF and IDREFS attributes contain valid references.
177 * This option works only if an ID index is available. To create
178 * an ID index, pass an index object as id_index argument to the
179 * parsing functions (such as parse_document_entity; see below).
181 * "Second pass" does not mean that the XML text is again parsed;
182 * only the existing document tree is traversed, and the check
183 * on bad IDREF/IDREFS attributes is performed for every node.
186 validate_by_dfa : bool;
187 (* If true, and if DFAs are available for validation, the DFAs will
188 * actually be used for validation.
189 * If false, or if no DFAs are available, the standard backtracking
190 * algorithm will be used.
191 * DFA = deterministic finite automaton.
193 * DFAs are only available if accept_only_deterministic_models is
194 * "true" (because in this case, it is relatively cheap to construct
195 * the DFAs). DFAs are a data structure which ensures that validation
196 * can always be performed in linear time.
198 * I strongly recommend using DFAs; however, there are examples
199 * for which validation by backtracking is faster.
202 accept_only_deterministic_models : bool;
203 (* Whether only deterministic content models are accepted in DTDs. *)
205 (* The following options are not implemented, or only for internal
209 debugging_mode : bool;
214 Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
215 | ExtID of (ext_id * Pxp_reader.resolver)
218 ?system_encoding:encoding -> ?id:ext_id -> ?fixenc:encoding ->
219 in_channel -> source
222 ?fixenc:encoding -> string -> source
225 ?system_encoding:encoding -> string -> source
227 (* Notes on sources (version 2):
229 * Sources specify where the XML text to parse comes from. Sources not only
230 * represent character streams, but also external IDs (i.e. SYSTEM or PUBLIC
231 * names), and they are interpreted using a specific character encoding.
232 * A source should be associated with an external ID, because otherwise
233 * it is not known how to handle relative names.
235 * There are two primary sources, Entity and ExtID, and several functions
236 * for derived sources. First, explanations of the functions:
238 * from_channel: The XML text is read from an in_channel. By default, the
239 * channel is not associated with an external ID, and it is impossible
240 * to resolve relative SYSTEM IDs found in the document.
241 * If the ?id argument is passed, it is assumed that the channel has this
242 * external ID. If relative SYSTEM IDs occur in the document, they can
243 * be interpreted; however, it is only possible to read from "file:"
245 * By default, the channel automatically detects the encoding. You can
246 * set a fixed encoding by passing the ?fixenc argument.
248 * from_string: The XML text is read from a string.
249 * It is impossible to read from any external entity whose reference is found
251 * By default, the encoding of the string is detected automatically. You can
252 * set a fixed encoding by passing the ?fixenc argument.
254 * from_file: The XML text is read from the file whose file name is
255 * passed to the function (as UTF-8 string).
256 * Relative system IDs can be interpreted by this function.
257 * The ?system_encoding argument specifies the character encoding used
258 * for file names (sic!). By default, UTF-8 is assumed.
262 * from_file "/tmp/file.xml":
263 * reads from this file, which is assumed to have the ID
264 * SYSTEM "file://localhost/tmp/file.xml".
266 * let ch = open_in "/tmp/file.xml" in
267 * from_channel ~id:(System "file://localhost/tmp/file.xml") ch
268 * This does the same, but uses a channel.
270 * from_channel ~id:(System "http://host/file.xml")
272 * reads from the channel ch, and it is assumed that the ID is
273 * SYSTEM "http://host/file.xml". If there is any relative SYSTEM ID,
274 * it will be interpreted relative to this location; however, there is
275 * no way to read via HTTP.
276 * If there is any "file:" SYSTEM ID, it is possible to read the file.
278 * The primary sources:
280 * - ExtID(x,r): The identifier x (either the SYSTEM or the PUBLIC name) of the
281 * entity to read from is passed to the resolver, and the resolver finds
282 * the entity and opens it.
283 * The intention of this option is to allow customized
284 * resolvers to interpret external identifiers without any restriction.
285 * The Pxp_reader module contains several classes allowing the user to
286 * compose such a customized resolver from predefined components.
288 * ExtID is the interface of choice for your own extensions to resolvers.
290 * - Entity(m,r): You can implement every behaviour by using a customized
291 * entity class. Once the DTD object d that will be used during parsing
292 * is known, the entity e = m d is determined and used together with the
294 * This is only for hackers.
299 val default_config : config
300 (* - Warnings are thrown away
301 * - Error messages will contain line numbers
302 * - Neither T_super_root nor T_pinstr nor T_comment nodes are generated
303 * - The internal encoding is ISO-8859-1
304 * - The standalone declaration is checked
305 * - Element positions are stored
306 * - The IDREF pass is left out
307 * - If available, DFAs are used for validation
308 * - Only deterministic content models are accepted
311 val default_extension : ('a node extension) as 'a
312 (* A "null" extension; an extension that does not extend the functionality *)
314 val default_spec : ('a node extension as 'a) spec
315 (* Specifies that you do not want to use extensions. *)
317 val parse_dtd_entity : config -> source -> dtd
318 (* Parse an entity containing a DTD (external subset), and return this DTD. *)
320 val extract_dtd_from_document_entity : config -> source -> dtd
321 (* Parse a closed document, i.e. a document beginning with <!DOCTYPE...>,
322 * and returns the DTD contained in the document.
323 * The parts of the document outside the DTD are actually not parsed,
324 * i.e. parsing stops when all declarations of the DTD have been read.
327 val parse_document_entity :
328 ?transform_dtd:(dtd -> dtd) ->
329 ?id_index:('ext index) ->
330 config -> source -> 'ext spec -> 'ext document
331 (* Parse a closed document, i.e. a document beginning with <!DOCTYPE...>,
332 * and validate the contents of the document against the DTD contained
333 * and/or referenced in the document.
335 * If the optional argument ~transform_dtd is passed, the following
336 * modification applies: After the DTD (both the internal and external
337 * subsets) has been parsed, the function ~transform_dtd is called,
338 * and the resulting DTD is actually used to validate the document.
340 * If the optional argument ~transform_dtd is missing, the parser
341 * behaves in the same way as if the identity were passed as ~transform_dtd.
343 * If the optional argument ~id_index is present, the parser adds
344 * any ID attribute to the passed index. An index is required to detect
345 * violations of the uniqueness of IDs.
348 val parse_wfdocument_entity :
349 config -> source -> 'ext spec -> 'ext document
350 (* Parse a closed document (see parse_document_entity), but do not
351 * validate it. Only checks on well-formedness are performed.
354 val parse_content_entity :
355 ?id_index:('ext index) ->
356 config -> source -> dtd -> 'ext spec -> 'ext node
357 (* Parse a file representing a well-formed fragment of a document. The
358 * fragment must be a single element (i.e. something like <a>...</a>;
359 * not a sequence like <a>...</a><b>...</b>). The element is validated
360 * against the passed DTD, but it is not checked whether the element is
361 * the root element specified in the DTD.
363 * If the optional argument ~id_index is present, the parser adds
364 * any ID attribute to the passed index. An index is required to detect
365 * violations of the uniqueness of IDs.
368 val parse_wfcontent_entity :
369 config -> source -> 'ext spec -> 'ext node
370 (* Parse a file representing a well-formed fragment of a document
371 * (see parse_content_entity). The fragment is not validated, only
372 * checked for well-formedness.