helm/DEVEL/pxp/pxp/doc/manual/src/yacc.mli.ent

   1 <!ENTITY markup-yacc.mli '
   2
   3 open Pxp_types
   4 open Pxp_dtd
   5 open Pxp_document
   6
   7 exception ID_not_unique  (* To be raised by the add method of an index when the ID is already present *)
   8
   9 class type [ &apos;ext ] index =
  10 object
  11   (* The type of indexes over the ID attributes of the elements. This type
  12    * is the minimal interface the parser needs in order to build such an index.
  13    *)
  14   constraint &apos;ext = &apos;ext node #extension
  15   method add : string -&gt; &apos;ext node -&gt; unit
  16     (* Adds the passed node to the index. If there is already an ID with
  17      * the passed string value, the exception ID_not_unique should be
  18      * raised. (An index is, however, also free to accept several identical IDs.)
  19      *)
  20   method find : string -&gt; &apos;ext node
  21     (* Returns the node with the passed ID value, or raises Not_found. *)
  22 end
  23 ;;
  24
  25
  26 class [ &apos;ext ] hash_index :
  27 object
  28   (* A simple implementation of the class type &apos;index&apos; that uses a hash table. *)
  29   constraint &apos;ext = &apos;ext node #extension
  30   method add : string -&gt; &apos;ext node -&gt; unit
  31     (* See the method add of the class type index above. *)
  32   method find : string -&gt; &apos;ext node
  33     (* See the method find of the class type index above. *)
  34   method index : (string, &apos;ext node) Hashtbl.t
  35     (* Returns the hash table (keys are strings, values are nodes). *)
  36 end
  37 ;;
  38
  39
  40 type config =
  41     { warner : collect_warnings;
  42          (* An object that collects warnings. *)
  43
  44       errors_with_line_numbers : bool;
  45          (* Whether error messages contain line numbers or not. The parser
  46           * is 10 to 20 per cent faster if line numbers are turned off;
  47           * in that case, only byte positions are reported.
  48           *)
  49
  50       enable_pinstr_nodes : bool;
  51          (* true: turns on a special mode for processing instructions (PIs).
  52           * Normally, you cannot determine the exact location of a PI; you only
  53           * know in which element the PI occurs. This mode makes it possible
  54           * to find out the exact location: every PI is artificially wrapped
  55           * by a special node with type T_pinstr. For example, if the XML text
  56           * is &lt;a&gt;&lt;?x?&gt;&lt;?y?&gt;&lt;/a&gt;, the parser normally produces only an element
  57           * object for "a", and puts the PIs "x" and "y" into it (without
  58           * order). In this mode, the object "a" will contain two objects
  59           * with type T_pinstr; the first object will contain "x", and the
  60           * second "y". The object tree looks like
  61           * - Node with type = T_element "a"
  62           *   - Node with type = T_pinstr "x"
  63           *     + contains processing instruction "x"
  64           *   - Node with type = T_pinstr "y"
  65           *     + contains processing instruction "y"
  66           *
  67           * Notes:
  68           * (1) In past versions of PXP this mode was called
  69           *     processing_instructions_inline, and it produced nodes of
  70           *     type T_element "-pi" instead of T_pinstr.
  71           * (2) The T_pinstr nodes are created from the pinstr exemplars
  72           *     in your spec.
  73           *)
  74
  75       enable_super_root_node : bool;
  76          (* true: the topmost node of the XML tree is not the root element,
  77           * but the so-called super root. The root element is a child of the
  78           * super root. The super root is a node with type T_super_root.
  79           * The following behaviour changes, too:
  80           * - PIs occurring outside the root element and outside the DTD are
  81           *   added to the super root instead of the document object
  82           * - If enable_pinstr_nodes is also turned on, the PI wrappers
  83           *   are added to the super root
  84           *
  85           * For example, the document
  86           *   &lt;?x?&gt;&lt;a&gt;y&lt;/a&gt;&lt;?y?&gt;
  87           * is normally represented by:
  88           * - document object
  89           *   + contains PIs x and y
  90           *   - reference to root node with type = T_element "a"
  91           *     - node with type = T_data: contains "y"
  92           * With enabled super root node:
  93           * - document object
  94           *   - reference to super root node with type = T_super_root
  95           *     + contains PIs x and y
  96           *     - root node with type = T_element "a"
  97           *       - node with type = T_data: contains "y"
  98           * If enable_pinstr_nodes is enabled as well:
  99           * - document object
 100           *   - reference to super root node with type = T_super_root
 101           *     - node with type = T_pinstr "x"
 102           *       + contains PI "x"
 103           *     - root node with type = T_element "a"
 104           *       - node with type = T_data: contains "y"
 105           *     - node with type = T_pinstr "y"
 106           *       + contains PI "y"
 107           * Notes:
 108           * (1) In previous versions of PXP this mode was called
 109           *     virtual_root, and it produced an additional node of type
 110           *     T_element "-vr" instead of T_super_root.
 111           * (2) The T_super_root node is created from the super root exemplar
 112           *     in your spec.
 113           *)
 114
 115       enable_comment_nodes : bool;
 116          (* When enabled, comments are represented as nodes with type =
 117           * T_comment.
 118           * To access the contents of comments, use the method "comment"
 119           * of the comment nodes.
 120           * These nodes behave like elements; however, they are normally
 121           * empty and do not have attributes. Note that it is possible to
 122           * add children to comment nodes and to set attributes, but it is
 123           * strongly recommended not to do so. There are no checks on
 124           * such abnormal use, because they would cost too
 125           * much time, even when no comment nodes are generated at all.
 126           *
 127           * Comment nodes should be disabled unless you must parse a
 128           * third-party XML text which uses comments as another data
 129           * container.
 130           *
 131           * The nodes of type T_comment are created from the comment exemplars
 132           * in your spec.
 133           *)
 134
 135       encoding : rep_encoding;
 136         (* Specifies the encoding used for the *internal* representation
 137          * of any character data.
 138          * Note that the default is still Enc_iso88591.
 139          *)
 140
 141       recognize_standalone_declaration : bool;
 142         (* Whether the "standalone" declaration is recognized or not.
 143          * This option does not have an effect on well-formedness parsing:
 144          * in this case such declarations are never recognized.
 145          *
 146          * Recognizing the "standalone" declaration means that the
 147          * value of the declaration is scanned and passed to the DTD,
 148          * and that the "standalone-check" is performed.
 149          *
 150          * Standalone-check: If a document is flagged standalone=&apos;yes&apos;
 151          * some additional constraints apply. The idea is that a parser
 152          * without access to any external document subsets can still parse
 153          * the document, and will still return the same values as the parser
 154          * with such access. For example, if the DTD is external and if
 155          * there are attributes with default values, it is checked that there
 156          * is no element instance where these attributes are omitted - the
 157          * parser would return the default value but this requires access to
 158          * the external DTD subset.
 159          *)
 160
 161       store_element_positions : bool;
 162         (* Whether the file name, the line and the column of the
 163          * beginning of elements are stored in the element nodes.
 164          * This option may be useful to generate error messages.
 165          *
 166          * Positions are only stored for:
 167          * - Elements
 168          * - Wrapped processing instructions (see enable_pinstr_nodes)
 169          * For all other node types, no position is stored.
 170          *
 171          * You can access positions by the method "position" of nodes.
 172          *)
 173
 174       idref_pass : bool;
 175         (* Whether the parser does a second pass and checks that all
 176          * IDREF and IDREFS attributes contain valid references.
 177          * This option works only if an ID index is available. To create
 178          * an ID index, pass an index object as id_index argument to the
 179          * parsing functions (such as parse_document_entity; see below).
 180          *
 181          * "Second pass" does not mean that the XML text is parsed again;
 182          * only the existing document tree is traversed, and the check
 183          * on bad IDREF/IDREFS attributes is performed for every node.
 184          *)
 185
 186       validate_by_dfa : bool;
 187         (* If true, and if DFAs are available for validation, the DFAs will
 188          * actually be used for validation.
 189          * If false, or if no DFAs are available, the standard backtracking
 190          * algorithm will be used.
 191          * DFA = deterministic finite automaton.
 192          *
 193          * DFAs are only available if accept_only_deterministic_models is
 194          * "true" (because in this case, it is relatively cheap to construct
 195          * the DFAs). DFAs are a data structure which ensures that validation
 196          * can always be performed in linear time.
 197          *
 198          * I strongly recommend using DFAs; however, there are examples
 199          * for which validation by backtracking is faster.
 200          *)
 201
 202       accept_only_deterministic_models : bool;
 203         (* Whether only deterministic content models are accepted in DTDs. *)
 204
 205       (* The following options are not implemented, or are for internal
 206        * use only.
 207        *)
 208
 209       debugging_mode : bool;
 210     }
 211
 212
 213 type source =
 214     Entity of ((dtd -&gt; Pxp_entity.entity) * Pxp_reader.resolver)
 215   | ExtID of (ext_id * Pxp_reader.resolver)
 216
 217 val from_channel :
 218       ?system_encoding:encoding -&gt; ?id:ext_id -&gt; ?fixenc:encoding -&gt;
 219       in_channel -&gt; source
 220
 221 val from_string :
 222       ?fixenc:encoding -&gt; string -&gt; source
 223
 224 val from_file :
 225       ?system_encoding:encoding -&gt; string -&gt; source
 226
 227 (* Notes on sources (version 2):
 228  *
 229  * Sources specify where the XML text to parse comes from. Sources not only
 230  * represent character streams, but also external IDs (i.e. SYSTEM or PUBLIC
 231  * names), and they are interpreted as a specific encoding of characters.
 232  * A source should be associated with an external ID, because otherwise
 233  * it is not known how to handle relative names.
 234  *
 235  * There are two primary sources, Entity and ExtID, and several functions
 236  * for derived sources. First, explanations of the functions:
 237  *
 238  * from_channel: The XML text is read from an in_channel. By default, the
 239  *   channel is not associated with an external ID, and it is impossible
 240  *   to resolve relative SYSTEM IDs found in the document.
 241  *   If the ?id argument is passed, it is assumed that the channel has this
 242  *   external ID. If relative SYSTEM IDs occur in the document, they can
 243  *   be interpreted; however, it is only possible to read from "file:"
 244  *   IDs.
 245  *   By default, the channel automatically detects the encoding. You can
 246  *   set a fixed encoding by passing the ?fixenc argument.
 247  *
 248  * from_string: The XML text is read from a string.
 249  *   It is impossible to read from any external entity whose reference is found
 250  *   in the string.
 251  *   By default, the encoding of the string is detected automatically. You can
 252  *   set a fixed encoding by passing the ?fixenc argument.
 253  *
 254  * from_file: The XML text is read from the file whose file name is
 255  *   passed to the function (as UTF-8 string).
 256  *   Relative system IDs can be interpreted by this function.
 257  *   The ?system_encoding argument specifies the character encoding used
 258  *   for file names (sic!). By default, UTF-8 is assumed.
 259  *
 260  * Examples:
 261  *
 262  * from_file "/tmp/file.xml":
 263  *   reads from this file, which is assumed to have the ID
 264  *   SYSTEM "file://localhost/tmp/file.xml".
 265  *
 266  * let ch = open_in "/tmp/file.xml" in
 267  * from_channel ~id:(System "file://localhost/tmp/file.xml") ch
 268  *   This does the same, but uses a channel.
 269  *
 270  * from_channel ~id:(System "http://host/file.xml")
 271  *              ch
 272  *   reads from the channel ch, and it is assumed that the ID is
 273  *   SYSTEM "http://host/file.xml". If there is any relative SYSTEM ID,
 274  *   it will be interpreted relative to this location; however, there is
 275  *   no way to read via HTTP.
 276  *   If there is any "file:" SYSTEM ID, it is possible to read the file.
 277  *
 278  * The primary sources:
 279  *
 280  * - ExtID(x,r): The identifier x (either the SYSTEM or the PUBLIC name) of the
 281  *   entity to read from is passed to the resolver, and the resolver finds
 282  *   the entity and opens it.
 283  *   The intention of this option is to allow customized
 284  *   resolvers to interpret external identifiers without any restriction.
 285  *   The Pxp_reader module contains several classes allowing the user to
 286  *   compose such a customized resolver from predefined components.
 287  *
 288  *   ExtID is the interface of choice for your own extensions to resolvers.
 289  *
 290  * - Entity(m,r): You can implement any behaviour by using a customized
 291  *   entity class. Once the DTD object d is known that will be used during
 292  *   parsing, the entity  e = m d  is determined and used together with the
 293  *   resolver r.
 294  *   This is only for hackers.
 295  *)
 296
 297
 298
 299 val default_config : config
 300   (* - Warnings are discarded
 301    * - Error messages will contain line numbers
 302    * - Neither T_super_root nor T_pinstr nor T_comment nodes are generated
 303    * - The internal encoding is ISO-8859-1
 304    * - The standalone declaration is checked
 305    * - Element positions are stored
 306    * - The IDREF pass is skipped
 307    * - If available, DFAs are used for validation
 308    * - Only deterministic content models are accepted
 309    *)
 310
 311 val default_extension : (&apos;a node extension) as &apos;a
 312   (* A "null" extension, i.e. an extension that does not add any functionality. *)
 313
 314 val default_spec : (&apos;a node extension as &apos;a) spec
 315   (* The spec to pass if you do not want to use extensions. *)
 316
 317 val parse_dtd_entity : config -&gt; source -&gt; dtd
 318   (* Parses an entity containing a DTD (external subset), and returns this DTD. *)
 319
 320 val extract_dtd_from_document_entity : config -&gt; source -&gt; dtd
 321   (* Parses a closed document, i.e. a document beginning with &lt;!DOCTYPE...&gt;,
 322    * and returns the DTD contained in the document.
 323    * The parts of the document outside the DTD are not actually parsed,
 324    * i.e. parsing stops once all declarations of the DTD have been read.
 325    *)
 326
 327 val parse_document_entity :
 328   ?transform_dtd:(dtd -&gt; dtd) -&gt;
 329   ?id_index:(&apos;ext index) -&gt;
 330   config -&gt; source -&gt; &apos;ext spec -&gt; &apos;ext document
 331   (* Parses a closed document, i.e. a document beginning with &lt;!DOCTYPE...&gt;,
 332    * and validates the contents of the document against the DTD contained
 333    * and/or referenced in the document.
 334    *
 335    * If the optional argument ~transform_dtd is passed, the following
 336    * modification applies: After the DTD (both the internal and external
 337    * subsets) has been parsed, the function ~transform_dtd is called,
 338    * and the resulting DTD is actually used to validate the document.
 339    *
 340    * If the optional argument ~transform_dtd is missing, the parser behaves
 341    * in the same way as if the identity function were passed as ~transform_dtd.
 342    *
 343    * If the optional argument ~id_index is present, the parser adds
 344    * any ID attribute to the passed index. An index is required to detect
 345    * violations of the uniqueness of IDs.
 346    *)
 347
 348 val parse_wfdocument_entity :
 349   config -&gt; source -&gt; &apos;ext spec -&gt; &apos;ext document
 350   (* Parses a closed document (see parse_document_entity), but does not
 351    * validate it. Only well-formedness checks are performed.
 352    *)
 353
 354 val parse_content_entity  :
 355   ?id_index:(&apos;ext index) -&gt;
 356   config -&gt; source -&gt; dtd -&gt; &apos;ext spec -&gt; &apos;ext node
 357   (* Parses a file representing a well-formed fragment of a document. The
 358    * fragment must be a single element (i.e. something like &lt;a&gt;...&lt;/a&gt;;
 359    * not a sequence like &lt;a&gt;...&lt;/a&gt;&lt;b&gt;...&lt;/b&gt;). The element is validated
 360    * against the passed DTD, but it is not checked whether the element is
 361    * the root element specified in the DTD.
 362    *
 363    * If the optional argument ~id_index is present, the parser adds
 364    * any ID attribute to the passed index. An index is required to detect
 365    * violations of the uniqueness of IDs.
 366    *)
 367
 368 val parse_wfcontent_entity :
 369   config -&gt; source -&gt; &apos;ext spec -&gt; &apos;ext node
 370   (* Parses a file representing a well-formed fragment of a document
 371    * (see parse_content_entity). The fragment is not validated; it is only
 372    * checked for well-formedness.
 373    *)
 374
 375
 376 '>