(* $Id$ * ---------------------------------------------------------------------- * PXP: The polymorphic XML parser for Objective Caml. * Copyright by Gerd Stolpmann. See LICENSE for details. *) type lexers = Document | Document_type | Content | Within_tag | Declaration | Content_comment | Decl_comment | Document_comment | Ignored_section type prolog_token = Pro_name of string | Pro_eq (* "=" *) | Pro_string of string (* "..." or '...' *) | Pro_eof type entity_id = < > (* The class without properties; but you can still compare if two objects * are the same. *) type token = | Begin_entity (* Beginning of entity *) | End_entity (* End of entity *) | Comment_begin (* *) | Ignore (* ignored whitespace *) | Eq (* = *) | Rangle (* > as tag delimiter *) | Rangle_empty (* /> as tag delimiter *) | Percent (* % followed by space in declaration *) | Plus (* + in declaration *) | Star (* * in declaration *) | Bar (* | in declaration *) | Comma (* , in declaration *) | Qmark (* ? in declaration *) | Pcdata (* #PCDATA in declaration *) | Required (* #REQUIRED in declaration *) | Implied (* #IMPLIED in declaration *) | Fixed (* #FIXED in declaration *) | Bof (* A marker for 'beginning of file' *) | Eof (* End of file *) | Conditional_begin of entity_id (* in declaration *) | Doctype of entity_id (* as DOCTYPE delimiter *) | Dtd_begin of entity_id (* '[' after DOCTYPE *) | Dtd_end of entity_id (* ']' *) | Decl_element of entity_id (* *) | Lparen of entity_id (* ( in declaration *) | Rparen of entity_id (* ) in declaration *) | RparenPlus of entity_id (* )+ in declaration *) | RparenStar of entity_id (* )* in declaration *) | RparenQmark of entity_id (* )? in declaration *) | Tag_beg of (string*entity_id) (* *) | PI_xml of (prolog_token list) (* *) | Cdata of string (* *) | CRef of int (* &#digits; *) | ERef of string (* &name; *) | PERef of string (* %name; *) | CharData of string (* any characters not otherwise matching *) | LineEnd of string | Name of string (* name *) | Nametoken of string (* nmtoken but not name *) | Attval of string (* attribute value; may contain entity refs *) | Attval_nl_normalized of string | Unparsed_string of string (* "data" or 'data' *) (**********************************************************************) (* debugging *) let string_of_tok tok = match tok with Begin_entity -> "Begin_entity" | End_entity -> "End_entity" | Doctype _ -> "Doctype" | Doctype_rangle _ -> "Doctype_rangle" | Comment_begin -> "Comment_begin" | Comment_end -> "Comment_end" | Comment_material _ -> "Comment_material" | Rangle -> "Rangle" | Rangle_empty -> "Rangle_empty" | Ignore -> "Ignore" | Eq -> "Eq" | Dtd_begin _ -> "Dtd_begin" | Dtd_end _ -> "Dtd_end" | Conditional_begin _ -> "Conditional_begin" | Conditional_body _ -> "Conditional_body" | Conditional_end _ -> "Conditional_end" | Percent -> "Percent" | Lparen _ -> "Lparen" | Rparen _ -> "Rparen" | Plus -> "Plus" | Star -> "Star" | Bar -> "Bar" | Comma -> "Comma" | Qmark -> "Qmark" | Pcdata -> "Pcdata" | Required -> "Required" | Implied -> "Implied" | Fixed -> "Fixed" | Decl_element _ -> "Decl_element" | Decl_attlist _ -> "Decl_attlist" | Decl_entity _ -> "Decl_entity" | Decl_notation _ -> "Decl_notation" | Decl_rangle _ -> "Decl_rangle" | RparenPlus _ -> "RparenPlus" | RparenStar _ -> "RparenStar" | RparenQmark _ -> "RparenQmark" | Bof -> "Bof" | Eof -> "Eof" | PI _ -> "PI" | PI_xml _ -> "PI_xml" | Tag_beg _ -> "Tag_beg" | Tag_end _ -> "Tag_end" | Cdata _ -> "Cdata" | CRef _ -> "CRef" | ERef _ -> "ERef" | PERef _ -> "PERef" | CharData _ -> "CharData" | Name _ -> "Name" | Nametoken _ -> "Nametoken" | Attval _ -> "Attval" | Attval_nl_normalized _ -> "Attval_nl_normalized" | Unparsed_string _ -> "Unparsed_string" | LineEnd _ -> "LineEnd" type lexer_set = { lex_encoding : Pxp_types.rep_encoding; scan_document : Lexing.lexbuf -> (token * lexers); scan_content : Lexing.lexbuf -> (token * lexers); scan_within_tag : Lexing.lexbuf -> (token * lexers); scan_document_type : Lexing.lexbuf -> (token * lexers); scan_declaration : Lexing.lexbuf -> (token * lexers); scan_content_comment : Lexing.lexbuf -> (token * lexers); scan_decl_comment : Lexing.lexbuf -> (token * lexers); scan_document_comment: Lexing.lexbuf -> (token * lexers); scan_ignored_section : Lexing.lexbuf -> (token * lexers); scan_xml_pi : Lexing.lexbuf -> prolog_token; scan_dtd_string : Lexing.lexbuf -> token; scan_content_string : Lexing.lexbuf -> token; scan_name_string : Lexing.lexbuf -> token; scan_only_xml_decl : Lexing.lexbuf -> token; scan_for_crlf : Lexing.lexbuf -> token; } (* ====================================================================== * History: * * $Log$ * Revision 1.1 2000/11/17 09:57:29 lpadovan * Initial revision * * Revision 1.2 2000/08/18 20:14:31 gerd * Comment -> Comment_begin, Comment_material, Comment_end. * * Revision 1.1 2000/05/29 23:48:38 gerd * Changed module names: * Markup_aux into Pxp_aux * Markup_codewriter into Pxp_codewriter * Markup_document into Pxp_document * Markup_dtd into Pxp_dtd * Markup_entity into Pxp_entity * Markup_lexer_types into Pxp_lexer_types * Markup_reader into Pxp_reader * Markup_types into Pxp_types * Markup_yacc into Pxp_yacc * See directory "compatibility" for (almost) compatible wrappers emulating * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. * * ====================================================================== * Old logs from markup_lexer_types.ml: * * Revision 1.6 2000/05/29 21:14:57 gerd * Changed the type 'encoding' into a polymorphic variant. * * Revision 1.5 2000/05/20 20:31:40 gerd * Big change: Added support for various encodings of the * internal representation. * * Revision 1.4 2000/05/14 17:45:36 gerd * Bugfix. * * Revision 1.3 2000/05/14 17:35:12 gerd * Conditional_begin, _end, and _body have an entity_id. * * Revision 1.2 2000/05/08 21:59:06 gerd * New token Bof (beginning of file). * * Revision 1.1 2000/05/06 23:21:49 gerd * Initial revision. * * * ====================================================================== * * DERIVED FROM REVISION 1.4 of markup_lexer_types_shadow.ml * * Revision 1.4 2000/04/30 18:19:04 gerd * Added new tokens. * * Revision 1.3 1999/08/31 19:13:31 gerd * Added checks on proper PE nesting. The idea is that tokens such * as Decl_element and Decl_rangle carry an entity ID with them. This ID * is simply an object of type < >, i.e. you can only test on identity. * The lexer always produces tokens with a dummy ID because it does not * know which entity is the current one. The entity layer replaces the dummy * ID with the actual ID. The parser checks that the IDs of pairs such as * Decl_element and Decl_rangle are the same; otherwise a Validation_error * is produced. * * Revision 1.2 1999/08/10 21:35:08 gerd * The XML/encoding declaration at the beginning of entities is * evaluated. In particular, entities have now a method "xml_declaration" * which returns the name/value pairs of such a declaration. The "encoding" * setting is interpreted by the entity itself; "version", and "standalone" * are interpreted by Markup_yacc.parse_document_entity. Other settings * are ignored (this does not conform to the standard; the standard prescribes * that "version" MUST be given in the declaration of document; "standalone" * and "encoding" CAN be declared; no other settings are allowed). * TODO: The user should be warned if the standard is not exactly * fulfilled. -- The "standalone" property is not checked yet. * * Revision 1.1 1999/08/10 00:35:51 gerd * Initial revision. * * *)