helm/DEVEL/pxp/pxp/pxp_yacc.mli

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  * PXP: The polymorphic XML parser for Objective Caml.
   4  * Copyright by Gerd Stolpmann. See LICENSE for details.
   5  *)
   6
   7
   8 (*$ markup-yacc.mli *)
   9
  10 open Pxp_types
  11 open Pxp_dtd
  12 open Pxp_document
  13
  14 exception ID_not_unique  (* Meant to be raised by the "add" method of an index on a duplicate ID. *)
  15
  16 class type [ 'ext ] index =
  17 object
  18   (* The type of indexes over the ID attributes of the elements. This type
  19    * is the minimum requirement needed by the parser to create such an index.
  20    *)
  21   constraint 'ext = 'ext node #extension
  22   method add : string -> 'ext node -> unit
  23     (* Add the passed node to the index. If there is already an ID with
  24      * the passed string value, the exception ID_not_unique should be
  25      * raised. (But the index is also free to accept several identical IDs.)
  26      *)
  27   method find : string -> 'ext node
  28     (* Finds the node with the passed ID value, or raises Not_found. *)
  29 end
  30 ;;
  31
  32
  33 class [ 'ext ] hash_index :
  34 object
  35   (* This is a simple implementation of 'index' using a hash table. *)
  36   constraint 'ext = 'ext node #extension
  37   method add : string -> 'ext node -> unit
  38     (* See the method "add" of the class type 'index'. *)
  39   method find : string -> 'ext node
  40     (* See the method "find" of the class type 'index'. *)
  41   method index : (string, 'ext node) Hashtbl.t
  42     (* Returns the underlying hash table (string keys, node values). *)
  43 end
  44 ;;
  45
  46
  47 type config =
  48     { warner : collect_warnings;
  49          (* An object that collects warnings. *)
  50
  51       errors_with_line_numbers : bool;
  52          (* Whether error messages contain line numbers or not. The parser
  53           * is 10 to 20 per cent faster if line numbers are turned off;
  54           * you get only byte positions in this case.
  55           *)
  56
  57       enable_pinstr_nodes : bool;
  58          (* true: turns on a special mode for processing instructions. Normally,
  59           * you cannot determine the exact location of a PI; you only know
  60           * in which element the PI occurs. This mode makes it possible
  61           * to find out the exact location: Every PI is artificially wrapped
  62           * by a special node with type T_pinstr. For example, if the XML text
  63           * is <a><?x?><?y?></a>, the parser normally produces only an element
  64           * object for "a", and puts the PIs "x" and "y" into it (without
  65           * order). In this mode, the object "a" will contain two objects
  66           * with type T_pinstr, and the first object will contain "x", and the
  67           * second "y": the object tree looks like
  68           * - Node with type = T_element "a"
  69           *   - Node with type = T_pinstr "x"
  70           *     + contains processing instruction "x"
  71           *   - Node with type = T_pinstr "y"
  72           *     + contains processing instruction "y"
  73           *
  74           * Notes:
  75           * (1) In past versions of PXP this mode was called
  76           *     processing_instructions_inline, and it produced nodes of
  77           *     type T_element "-pi" instead of T_pinstr.
  78           * (2) The T_pinstr nodes are created from the pinstr exemplars
  79           *     in your spec.
  80           *)
  81
  82       enable_super_root_node : bool;
  83          (* true: the topmost element of the XML tree is not the root element,
  84           * but the so-called super root. The root element is a child of the
  85           * super root. The super root is a node with type T_super_root.
  86           * The following behaviour changes, too:
  87           * - PIs occurring outside the root element and outside the DTD are
  88           *   added to the super root instead of the document object
  89           * - If enable_pinstr_nodes is also turned on, the PI wrappers
  90           *   are added to the super root
  91           *
  92           * For example, the document
  93           *   <?x?><a>y</a><?y?>
  94           * is normally represented by:
  95           * - document object
  96           *   + contains PIs x and y
  97           *   - reference to root node with type = T_element "a"
  98           *     - node with type = T_data: contains "y"
  99           * With enabled super root node:
 100           * - document object
 101           *   - reference to super root node with type = T_super_root
 102           *     + contains PIs x and y
 103           *     - root node with type = T_element "a"
 104           *       - node with type = T_data: contains "y"
 105           * If also enable_pinstr_nodes:
 106           * - document object
 107           *   - reference to super root node with type = T_super_root
 108           *     - node with type = T_pinstr "x"
 109           *       + contains PI "x"
 110           *     - root node with type = T_element "a"
 111           *       - node with type = T_data: contains "y"
 112           *     - node with type = T_pinstr "y"
 113           *       + contains PI "y"
 114           * Notes:
 115           * (1) In previous versions of PXP this mode was called
 116           *     virtual_root, and it produced an additional node of type
 117           *     T_element "-vr" instead of T_super_root.
 118           * (2) The T_super_root node is created from the super root exemplar
 119           *     in your spec.
 120           *)
 121
 122       enable_comment_nodes : bool;
 123          (* When enabled, comments are represented as nodes with type =
 124           * T_comment.
 125           * To access the contents of comments, use the method "comment"
 126           * for the comment nodes.
 127           * These nodes behave like elements; however, they are normally
 128           * empty and do not have attributes. Note that it is possible to
 129           * add children to comment nodes and to set attributes, but it is
 130           * strongly recommended not to do so. There are no checks on
 131           * such abnormal use, because they would cost too
 132           * much time, even when no comment nodes are generated at all.
 133           *
 134           * Comment nodes should be disabled unless you must parse a
 135           * third-party XML text which uses comments as another data
 136           * container.
 137           *
 138           * The nodes of type T_comment are created from the comment exemplars
 139           * in your spec.
 140           *)
 141
 142       encoding : rep_encoding;
 143         (* Specifies the encoding used for the *internal* representation
 144          * of any character data.
 145          * Note that the default is still Enc_iso88591.
 146          *)
 147
 148       recognize_standalone_declaration : bool;
 149         (* Whether the "standalone" declaration is recognized or not.
 150          * This option does not have an effect on well-formedness parsing:
 151          * in this case such declarations are never recognized.
 152          *
 153          * Recognizing the "standalone" declaration means that the
 154          * value of the declaration is scanned and passed to the DTD,
 155          * and that the "standalone-check" is performed.
 156          *
 157          * Standalone-check: If a document is flagged standalone='yes'
 158          * some additional constraints apply. The idea is that a parser
 159          * without access to any external document subsets can still parse
 160          * the document, and will still return the same values as the parser
 161          * with such access. For example, if the DTD is external and if
 162          * there are attributes with default values, it is checked that there
 163          * is no element instance where these attributes are omitted - the
 164          * parser would return the default value but this requires access to
 165          * the external DTD subset.
 166          *)
 167
 168       store_element_positions : bool;
 169         (* Whether the file name, the line and the column of the
 170          * beginning of elements are stored in the element nodes.
 171          * This option may be useful to generate error messages.
 172          *
 173          * Positions are only stored for:
 174          * - Elements
 175          * - Wrapped processing instructions (see enable_pinstr_nodes)
 176          * For all other node types, no position is stored.
 177          *
 178          * You can access positions by the method "position" of nodes.
 179          *)
 180
 181       idref_pass : bool;
 182         (* Whether the parser does a second pass and checks that all
 183          * IDREF and IDREFS attributes contain valid references.
 184          * This option works only if an ID index is available. To create
 185          * an ID index, pass an index object as id_index argument to the
 186          * parsing functions (such as parse_document_entity; see below).
 187          *
 188          * "Second pass" does not mean that the XML text is again parsed;
 189          * only the existing document tree is traversed, and the check
 190          * on bad IDREF/IDREFS attributes is performed for every node.
 191          *)
 192
 193       validate_by_dfa : bool;
 194         (* If true, and if DFAs are available for validation, the DFAs will
 195          * actually be used for validation.
 196          * If false, or if no DFAs are available, the standard backtracking
 197          * algorithm will be used.
 198          * DFA = deterministic finite automaton.
 199          *
 200          * DFAs are only available if accept_only_deterministic_models is
 201          * "true" (because in this case, it is relatively cheap to construct
 202          * the DFAs). DFAs are a data structure which ensures that validation
 203          * can always be performed in linear time.
 204          *
 205          * I strongly recommend using DFAs; however, there are examples
 206          * for which validation by backtracking is faster.
 207          *)
 208
 209       accept_only_deterministic_models : bool;
 210         (* Whether only deterministic content models are accepted in DTDs. *)
 211
 212       (* The following options are not implemented, or only for internal
 213        * use.
 214        *)
 215
 216       debugging_mode : bool;
 217     }
 218
 219
 220 type source =  (* The two primary sources; see "Notes on sources" below. *)
 221     Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
 222   | ExtID of (ext_id * Pxp_reader.resolver)
 223
 224 val from_channel :
 225       ?system_encoding:encoding -> ?id:ext_id -> ?fixenc:encoding ->
 226       in_channel -> source  (* Reads the XML text from a channel; see "Notes on sources" below. *)
 227
 228 val from_string :
 229       ?fixenc:encoding -> string -> source  (* Reads the XML text from a string; see "Notes on sources" below. *)
 230
 231 val from_file :
 232       ?system_encoding:encoding -> string -> source  (* Reads the XML text from the named file; see "Notes on sources" below. *)
 233
 234 (* Notes on sources (version 2):
 235  *
 236  * Sources specify where the XML text to parse comes from. Sources not only
 237  * represent character streams, but also external IDs (i.e. SYSTEM or PUBLIC
 238  * names), and they are interpreted as a specific encoding of characters.
 239  * A source should be associated with an external ID, because otherwise
 240  * it is not known how to handle relative names.
 241  *
 242  * There are two primary sources, Entity and ExtID, and several functions
 243  * for derived sources. The functions are explained first:
 244  *
 245  * from_channel: The XML text is read from an in_channel. By default, the
 246  *   channel is not associated with an external ID, and it is impossible
 247  *   to resolve relative SYSTEM IDs found in the document.
 248  *   If the ?id argument is passed, it is assumed that the channel has this
 249  *   external ID. If relative SYSTEM IDs occur in the document, they can
 250  *   be interpreted; however, it is only possible to read from "file:"
 251  *   IDs.
 252  *   By default, the channel automatically detects the encoding. You can
 253  *   set a fixed encoding by passing the ?fixenc argument.
 254  *
 255  * from_string: The XML text is read from a string.
 256  *   It is impossible to read from any external entity whose reference is found
 257  *   in the string.
 258  *   By default, the encoding of the string is detected automatically. You can
 259  *   set a fixed encoding by passing the ?fixenc argument.
 260  *
 261  * from_file: The XML text is read from the file whose file name is
 262  *   passed to the function (as UTF-8 string).
 263  *   Relative system IDs can be interpreted by this function.
 264  *   The ?system_encoding argument specifies the character encoding used
 265  *   for file names (sic!). By default, UTF-8 is assumed.
 266  *
 267  * Examples:
 268  *
 269  * from_file "/tmp/file.xml":
 270  *   reads from this file, which is assumed to have the ID
 271  *   SYSTEM "file://localhost/tmp/file.xml".
 272  *
 273  * let ch = open_in "/tmp/file.xml" in
 274  * from_channel ~id:(System "file://localhost/tmp/file.xml") ch
 275  *   This does the same, but uses a channel.
 276  *
 277  * from_channel ~id:(System "http://host/file.xml")
 278  *              ch
 279  *   reads from the channel ch, and it is assumed that the ID is
 280  *   SYSTEM "http://host/file.xml". If there is any relative SYSTEM ID,
 281  *   it will be interpreted relative to this location; however, there is
 282  *   no way to read via HTTP.
 283  *   If there is any "file:" SYSTEM ID, it is possible to read the file.
 284  *
 285  * The primary sources:
 286  *
 287  * - ExtID(x,r): The identifier x (either the SYSTEM or the PUBLIC name) of the
 288  *   entity to read from is passed to the resolver, and the resolver finds
 289  *   the entity and opens it.
 290  *   The intention of this option is to allow customized
 291  *   resolvers to interpret external identifiers without any restriction.
 292  *   The Pxp_reader module contains several classes allowing the user to
 293  *   compose such a customized resolver from predefined components.
 294  *
 295  *   ExtID is the interface of choice for your own extensions to resolvers.
 296  *
 297  * - Entity(m,r): You can implement any behaviour by using a customized
 298  *   entity class. Once the DTD object d is known that will be used during
 299  *   parsing, the entity  e = m d  is determined and used together with the
 300  *   resolver r.
 301  *   This is only for hackers.
 302  *)
 303
 304
 305
 306 val default_config : config
 307   (* - Warnings are thrown away (warner)
 308    * - Error messages will contain line numbers (errors_with_line_numbers)
 309    * - Neither T_super_root nor T_pinstr nor T_comment nodes are generated
 310    * - The internal encoding is ISO-8859-1 (encoding = Enc_iso88591)
 311    * - The standalone declaration is checked (recognize_standalone_declaration)
 312    * - Element positions are stored (store_element_positions)
 313    * - The IDREF pass is left out (idref_pass)
 314    * - If available, DFAs are used for validation (validate_by_dfa)
 315    * - Only deterministic content models are accepted
 316    *)
 317
 318 val default_extension : ('a node extension) as 'a
 319   (* A "null" extension, i.e. an extension that does not extend the functionality. *)
 320
 321 val default_spec : ('a node extension as 'a) spec
 322   (* Specifies that you do not want to use extensions (same extension type as default_extension). *)
 323
 324 val parse_dtd_entity : config -> source -> dtd
 325   (* Parses an entity containing a DTD (external subset), and returns this DTD. *)
 326
 327 val extract_dtd_from_document_entity : config -> source -> dtd
 328   (* Parses a closed document, i.e. a document beginning with <!DOCTYPE...>,
 329    * and returns the DTD contained in the document.
 330    * The parts of the document outside the DTD are not actually parsed,
 331    * i.e. parsing stops when all declarations of the DTD have been read.
 332    *)
 333
 334 val parse_document_entity :
 335   ?transform_dtd:(dtd -> dtd) ->
 336   ?id_index:('ext index) ->
 337   config -> source -> 'ext spec -> 'ext document
 338   (* Parse a closed document, i.e. a document beginning with <!DOCTYPE...>,
 339    * and validate the contents of the document against the DTD contained
 340    * and/or referenced in the document.
 341    *
 342    * If the optional argument ~transform_dtd is passed, the following
 343    * modification applies: After the DTD (both the internal and external
 344    * subsets) has been parsed, the function ~transform_dtd is called,
 345    * and the resulting DTD is actually used to validate the document.
 346    *
 347    * If the optional argument ~transform_dtd is missing, the parser
 348    * behaves in the same way as if the identity were passed as ~transform_dtd.
 349    *
 350    * If the optional argument ~id_index is present, the parser adds
 351    * any ID attribute to the passed index. An index is required to detect
 352    * violations of the uniqueness of IDs, and for the config option idref_pass.
 353    *)
 354
 355 val parse_wfdocument_entity :
 356   config -> source -> 'ext spec -> 'ext document
 357   (* Parse a closed document (see parse_document_entity), but do not
 358    * validate it. Only checks on well-formedness are performed.
 359    * The "standalone" declaration is never recognized in this mode. *)
 360
 361 val parse_content_entity  :
 362   ?id_index:('ext index) ->
 363   config -> source -> dtd -> 'ext spec -> 'ext node
 364   (* Parse a file representing a well-formed fragment of a document. The
 365    * fragment must be a single element (i.e. something like <a>...</a>;
 366    * not a sequence like <a>...</a><b>...</b>). The element is validated
 367    * against the passed DTD, but it is not checked whether the element is
 368    * the root element specified in the DTD.
 369    *
 370    * If the optional argument ~id_index is present, the parser adds
 371    * any ID attribute to the passed index. An index is required to detect
 372    * violations of the uniqueness of IDs, and for the config option idref_pass.
 373    *)
 374
 375 val parse_wfcontent_entity :
 376   config -> source -> 'ext spec -> 'ext node
 377   (* Parse a file representing a well-formed fragment of a document
 378    * (see parse_content_entity). The fragment is not validated, only
 379    * checked for well-formedness. Unlike parse_content_entity, this
 380    * function takes neither a DTD nor an ~id_index argument. *)
 381
 382
 383 (*$-*)
 384
 385
 386 (* ======================================================================
 387  * History:
 388  *
 389  * $Log$
 390  * Revision 1.1  2000/11/17 09:57:30  lpadovan
 391  * Initial revision
 392  *
 393  * Revision 1.7  2000/08/18 20:15:43  gerd
 394  *      Config options:
 395  * - enable_super_root_nodes: new name for virtual_root
 396  * - enable_pinstr_nodes: new name for processing_instructions_inline
 397  * - enable_comment_nodes: new option
 398  *      Updated comments for various options.
 399  *
 400  * Revision 1.6  2000/07/23 02:16:33  gerd
 401  *      Support for DFAs.
 402  *
 403  * Revision 1.5  2000/07/14 13:57:29  gerd
 404  *      Added the id_index feature.
 405  *
 406  * Revision 1.4  2000/07/09 17:52:54  gerd
 407  *      New option store_element_positions.
 408  *
 409  * Revision 1.3  2000/07/08 16:26:21  gerd
 410  *      Added the signatures of the functions
 411  * 'extract_dtd_from_document_entity' and 'parse_wfcontent_entity'.
 412  * Updated the signature of 'parse_document_entity': New optional
 413  * argument 'transform_dtd'.
 414  *      Updated the comments.
 415  *
 416  * Revision 1.2  2000/07/04 22:09:03  gerd
 417  *      MAJOR CHANGE: Redesign of the interface (not yet complete).
 418  *
 419  * Revision 1.1  2000/05/29 23:48:38  gerd
 420  *      Changed module names:
 421  *              Markup_aux          into Pxp_aux
 422  *              Markup_codewriter   into Pxp_codewriter
 423  *              Markup_document     into Pxp_document
 424  *              Markup_dtd          into Pxp_dtd
 425  *              Markup_entity       into Pxp_entity
 426  *              Markup_lexer_types  into Pxp_lexer_types
 427  *              Markup_reader       into Pxp_reader
 428  *              Markup_types        into Pxp_types
 429  *              Markup_yacc         into Pxp_yacc
 430  * See directory "compatibility" for (almost) compatible wrappers emulating
 431  * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 432  *
 433  * ======================================================================
 434  * Old logs from markup_yacc.mli:
 435  *
 436  * Revision 1.4  2000/05/29 21:14:57  gerd
 437  *      Changed the type 'encoding' into a polymorphic variant.
 438  *
 439  * Revision 1.3  2000/05/27 19:24:01  gerd
 440  *      New option: recognize_standalone_declaration.
 441  *
 442  * Revision 1.2  2000/05/20 20:31:40  gerd
 443  *      Big change: Added support for various encodings of the
 444  * internal representation.
 445  *
 446  * Revision 1.1  2000/05/06 23:21:49  gerd
 447  *      Initial revision.
 448  *
 449  * Revision 1.9  2000/04/30 18:23:38  gerd
 450  *      New config options 'processing_instructions_inline' and
 451  * 'virtual_root'.
 452  *
 453  * Revision 1.8  2000/03/13 23:46:46  gerd
 454  *      Change: The 'resolver' component of the 'config' type has
 455  * disappeared. Instead, there is a new resolver component in the Entity
 456  * and ExtID values of 'source'. I hope that this makes clearer that the
 457  * resolver has only an effect if used together with Entity and ExtID
 458  * sources.
 459  *      Change: The Entity value can now return the entity dependent
 460  * on the DTD that is going to be used.
 461  *
 462  * Revision 1.7  2000/02/22 02:32:02  gerd
 463  *      Updated.
 464  *
 465  * Revision 1.6  2000/02/22 01:52:45  gerd
 466  *      Added documentation.
 467  *
 468  * Revision 1.5  2000/01/20 20:54:43  gerd
 469  *      New config.errors_with_line_numbers.
 470  *
 471  * Revision 1.4  1999/09/01 23:09:10  gerd
 472  *      New function parse_wf_entity that simulates a well-formedness
 473  * parser.
 474  *
 475  * Revision 1.3  1999/09/01 16:26:36  gerd
 476  *      Added an empty line. This is *really* a big change.
 477  *
 478  * Revision 1.2  1999/08/14 22:20:27  gerd
 479  *         The "config" slot has now a component "warner" which is
 480  * an object with a "warn" method. This is used to warn about characters
 481  * that cannot be represented in the Latin 1 alphabet.
 482  *         Furthermore, there is a new component "debugging_mode".
 483  *
 484  * Revision 1.1  1999/08/10 00:35:52  gerd
 485  *      Initial revision.
 486  *
 487  *
 488  *)