helm/DEVEL/pxp/pxp/pxp_lexer_types.ml

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  * PXP: The polymorphic XML parser for Objective Caml.
   4  * Copyright by Gerd Stolpmann. See LICENSE for details.
   5  *)
   6
   7 type lexers =
   8     Document
   9   | Document_type
  10   | Content
  11   | Within_tag
  12   | Declaration
  13   | Content_comment
  14   | Decl_comment
  15   | Document_comment
  16   | Ignored_section
  17
  18
  19 type prolog_token =
  20     Pro_name of string
  21   | Pro_eq                  (* "=" *)
  22   | Pro_string of string    (* "..." or '...' *)
  23   | Pro_eof
  24
  25
  26 type entity_id = < >
  27   (* The class without properties; but you can still compare if two objects
  28    * are the same.
  29    *)
  30
  31 type token =
  32   | Begin_entity             (* Beginning of entity *)
  33   | End_entity               (* End of entity *)
  34   | Comment_begin            (* <!-- *)
  35   | Comment_material of string (* within a comment *)
  36   | Comment_end              (* --> *)
  37   | Ignore                   (* ignored whitespace *)
  38   | Eq                       (* = *)
  39   | Rangle                   (* > as tag delimiter *)
  40   | Rangle_empty             (* /> as tag delimiter *)
  41   | Percent                  (* % followed by space in declaration *)
  42   | Plus                     (* + in declaration *)
  43   | Star                     (* * in declaration *)
  44   | Bar                      (* | in declaration *)
  45   | Comma                    (* , in declaration *)
  46   | Qmark                    (* ? in declaration *)
  47   | Pcdata                   (* #PCDATA in declaration *)
  48   | Required                 (* #REQUIRED in declaration *)
  49   | Implied                  (* #IMPLIED in declaration *)
  50   | Fixed                    (* #FIXED in declaration *)
  51   | Bof                      (* A marker for 'beginning of file' *)
  52   | Eof                      (* End of file *)
  53   | Conditional_begin of entity_id  (* <![ in declaration *)
  54   | Conditional_body  of entity_id  (* [ in declaration *)
  55   | Conditional_end   of entity_id  (* ]]> in declaration *)
  56   | Doctype        of entity_id  (* <!DOCTYPE *)
  57   | Doctype_rangle of entity_id  (* > as DOCTYPE delimiter *)
  58   | Dtd_begin      of entity_id  (* '[' after DOCTYPE *)
  59   | Dtd_end        of entity_id  (* ']' *)
  60   | Decl_element   of entity_id  (* <!ELEMENT *)
  61   | Decl_attlist   of entity_id  (* <!ATTLIST *)
  62   | Decl_entity    of entity_id  (* <!ENTITY *)
  63   | Decl_notation  of entity_id  (* <!NOTATION *)
  64   | Decl_rangle    of entity_id  (* > *)
  65   | Lparen         of entity_id  (* ( in declaration *)
  66   | Rparen         of entity_id  (* ) in declaration *)
  67   | RparenPlus     of entity_id  (* )+ in declaration *)
  68   | RparenStar     of entity_id  (* )* in declaration *)
  69   | RparenQmark    of entity_id  (* )? in declaration *)
  70
  71   | Tag_beg of (string*entity_id)     (* <name *)
  72   | Tag_end of (string*entity_id)     (* </name *)
  73
  74   | PI        of (string*string)      (* <?name ... ?> *)
  75   | PI_xml    of (prolog_token list)  (* <?xml ...?> *)
  76   | Cdata     of string               (* <![CDATA[...]]> *)
  77   | CRef      of int                  (* &#digits; *)
  78   | ERef      of string               (* &name; *)
  79   | PERef     of string               (* %name; *)
  80   | CharData  of string             (* any characters not otherwise matching *)
  81   | LineEnd   of string
  82   | Name      of string               (* name *)
  83   | Nametoken of string               (* nmtoken but not name *)
  84   | Attval    of string           (* attribute value; may contain entity refs *)
  85   | Attval_nl_normalized of string
  86   | Unparsed_string      of string    (* "data" or 'data' *)
  87
  88
  89 (**********************************************************************)
  90 (* debugging *)
  91
  92 let string_of_tok tok =
  93   match tok with
  94     Begin_entity -> "Begin_entity"
  95   | End_entity -> "End_entity"
  96   | Doctype _ -> "Doctype"
  97   | Doctype_rangle _ -> "Doctype_rangle"
  98   | Comment_begin -> "Comment_begin"
  99   | Comment_end -> "Comment_end"
 100   | Comment_material _ -> "Comment_material"
 101   | Rangle -> "Rangle"
 102   | Rangle_empty -> "Rangle_empty"
 103   | Ignore -> "Ignore"
 104   | Eq -> "Eq"
 105   | Dtd_begin _ -> "Dtd_begin"
 106   | Dtd_end _ -> "Dtd_end"
 107   | Conditional_begin _ -> "Conditional_begin"
 108   | Conditional_body _ -> "Conditional_body"
 109   | Conditional_end _ -> "Conditional_end"
 110   | Percent -> "Percent"
 111   | Lparen _ -> "Lparen"
 112   | Rparen _ -> "Rparen"
 113   | Plus -> "Plus"
 114   | Star -> "Star"
 115   | Bar -> "Bar"
 116   | Comma -> "Comma"
 117   | Qmark -> "Qmark"
 118   | Pcdata -> "Pcdata"
 119   | Required -> "Required"
 120   | Implied -> "Implied"
 121   | Fixed -> "Fixed"
 122   | Decl_element _ -> "Decl_element"
 123   | Decl_attlist _ -> "Decl_attlist"
 124   | Decl_entity _ -> "Decl_entity"
 125   | Decl_notation _ -> "Decl_notation"
 126   | Decl_rangle _ -> "Decl_rangle"
 127   | RparenPlus _ -> "RparenPlus"
 128   | RparenStar _ -> "RparenStar"
 129   | RparenQmark _ -> "RparenQmark"
 130   | Bof -> "Bof"
 131   | Eof -> "Eof"
 132   | PI _ -> "PI"
 133   | PI_xml _ -> "PI_xml"
 134   | Tag_beg _ -> "Tag_beg"
 135   | Tag_end _ -> "Tag_end"
 136   | Cdata _ -> "Cdata"
 137   | CRef _ -> "CRef"
 138   | ERef _ -> "ERef"
 139   | PERef _ -> "PERef"
 140   | CharData _ -> "CharData"
 141   | Name _ -> "Name"
 142   | Nametoken _ -> "Nametoken"
 143   | Attval _ -> "Attval"
 144   | Attval_nl_normalized _ -> "Attval_nl_normalized"
 145   | Unparsed_string _ -> "Unparsed_string"
 146   | LineEnd _ -> "LineEnd"
 147
 148
 149 type lexer_set =
 150     { lex_encoding         : Pxp_types.rep_encoding;
 151       scan_document        : Lexing.lexbuf -> (token * lexers);
 152       scan_content         : Lexing.lexbuf -> (token * lexers);
 153       scan_within_tag      : Lexing.lexbuf -> (token * lexers);
 154       scan_document_type   : Lexing.lexbuf -> (token * lexers);
 155       scan_declaration     : Lexing.lexbuf -> (token * lexers);
 156       scan_content_comment : Lexing.lexbuf -> (token * lexers);
 157       scan_decl_comment    : Lexing.lexbuf -> (token * lexers);
 158       scan_document_comment: Lexing.lexbuf -> (token * lexers);
 159       scan_ignored_section : Lexing.lexbuf -> (token * lexers);
 160       scan_xml_pi          : Lexing.lexbuf -> prolog_token;
 161       scan_dtd_string      : Lexing.lexbuf -> token;
 162       scan_content_string  : Lexing.lexbuf -> token;
 163       scan_name_string     : Lexing.lexbuf -> token;
 164       scan_only_xml_decl   : Lexing.lexbuf -> token;
 165       scan_for_crlf        : Lexing.lexbuf -> token;
 166     }
 167
 168 (* ======================================================================
 169  * History:
 170  *
 171  * $Log$
 172  * Revision 1.1  2000/11/17 09:57:29  lpadovan
 173  * Initial revision
 174  *
 175  * Revision 1.2  2000/08/18 20:14:31  gerd
 176  *      Comment -> Comment_begin, Comment_material, Comment_end.
 177  *
 178  * Revision 1.1  2000/05/29 23:48:38  gerd
 179  *      Changed module names:
 180  *              Markup_aux          into Pxp_aux
 181  *              Markup_codewriter   into Pxp_codewriter
 182  *              Markup_document     into Pxp_document
 183  *              Markup_dtd          into Pxp_dtd
 184  *              Markup_entity       into Pxp_entity
 185  *              Markup_lexer_types  into Pxp_lexer_types
 186  *              Markup_reader       into Pxp_reader
 187  *              Markup_types        into Pxp_types
 188  *              Markup_yacc         into Pxp_yacc
 189  * See directory "compatibility" for (almost) compatible wrappers emulating
 190  * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 191  *
 192  * ======================================================================
 193  * Old logs from markup_lexer_types.ml:
 194  *
 195  * Revision 1.6  2000/05/29 21:14:57  gerd
 196  *      Changed the type 'encoding' into a polymorphic variant.
 197  *
 198  * Revision 1.5  2000/05/20 20:31:40  gerd
 199  *      Big change: Added support for various encodings of the
 200  * internal representation.
 201  *
 202  * Revision 1.4  2000/05/14 17:45:36  gerd
 203  *      Bugfix.
 204  *
 205  * Revision 1.3  2000/05/14 17:35:12  gerd
 206  *      Conditional_begin, _end, and _body have an entity_id.
 207  *
 208  * Revision 1.2  2000/05/08 21:59:06  gerd
 209  *      New token Bof (beginning of file).
 210  *
 211  * Revision 1.1  2000/05/06 23:21:49  gerd
 212  *      Initial revision.
 213  *
 214  *
 215  * ======================================================================
 216  *
 217  * DERIVED FROM REVISION 1.4 of markup_lexer_types_shadow.ml
 218  *
 219  * Revision 1.4  2000/04/30 18:19:04  gerd
 220  *      Added new tokens.
 221  *
 222  * Revision 1.3  1999/08/31 19:13:31  gerd
 223  *      Added checks on proper PE nesting. The idea is that tokens such
 224  * as Decl_element and Decl_rangle carry an entity ID with them. This ID
 225  * is simply an object of type < >, i.e. you can only test on identity.
 226  * The lexer always produces tokens with a dummy ID because it does not
 227  * know which entity is the current one. The entity layer replaces the dummy
 228  * ID with the actual ID. The parser checks that the IDs of pairs such as
 229  * Decl_element and Decl_rangle are the same; otherwise a Validation_error
 230  * is produced.
 231  *
 232  * Revision 1.2  1999/08/10 21:35:08  gerd
 233  *      The XML/encoding declaration at the beginning of entities is
 234  * evaluated. In particular, entities have now a method "xml_declaration"
 235  * which returns the name/value pairs of such a declaration. The "encoding"
 236  * setting is interpreted by the entity itself; "version", and "standalone"
 237  * are interpreted by Markup_yacc.parse_document_entity. Other settings
 238  * are ignored (this does not conform to the standard; the standard prescribes
 239  * that "version" MUST be given in the declaration of document; "standalone"
 240  * and "encoding" CAN be declared; no other settings are allowed).
 241  *      TODO: The user should be warned if the standard is not exactly
 242  * fulfilled. -- The "standalone" property is not checked yet.
 243  *
 244  * Revision 1.1  1999/08/10 00:35:51  gerd
 245  *      Initial revision.
 246  *
 247  *
 248  *)