(* Provenance: recovered from a collapsed gitweb "blobdiff_plain" patch
 * (helm.git, matita.cs.unibo.it) that added the new file
 *   helm/DEVEL/pxp/pxp/pxp_lexer_types.ml   (blob 988e9d08b, 248 lines,
 *   commit c03d2c1fdab8d228cb88aaba5ca0f556318bebc5).
 * The unified-diff header and the leading "+" markers were removed so that
 * the text is plain OCaml again.
 *)

(* $Id$
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)

(* One constructor per scanner entry point: every constructor has a
 * same-named scan_* field in the record type lexer_set below, and those
 * scanners return a value of this type next to the token.
 *)
type lexers =
    Document
  | Document_type
  | Content
  | Within_tag
  | Declaration
  | Content_comment
  | Decl_comment
  | Document_comment
  | Ignored_section


(* Tokens produced by the scan_xml_pi scanner; a list of them is the
 * payload of the PI_xml token.
 *)
type prolog_token =
    Pro_name of string
  | Pro_eq                  (* "=" *)
  | Pro_string of string    (* "..." or '...' *)
  | Pro_eof


type entity_id = < >
  (* The class without properties; but you can still compare if two objects
   * are the same.
   *)

(* The token type.
 *
 * NOTE(review): the constructors marked [restored] were missing from the
 * collapsed text this file was recovered from (everything between '<' and
 * '>' had been stripped), although string_of_tok below matches on every
 * one of them.  Their names and arities are taken from string_of_tok.  The
 * entity_id payload of Conditional_body, Conditional_end and Decl_rangle
 * is backed by the history log at the end of this file; the remaining
 * payload types follow the sibling constructors and should be confirmed
 * against the upstream PXP sources.  The comments containing '<' or '>'
 * were reconstructed as well.
 *)
type token =
  | Begin_entity             (* Beginning of entity *)
  | End_entity               (* End of entity *)
  | Comment_begin            (* <!-- *)
  | Comment_material of string (* within a comment *)        (* [restored] *)
  | Comment_end              (* --> *)                       (* [restored] *)
  | Ignore                   (* ignored whitespace *)
  | Eq                       (* = *)
  | Rangle                   (* > as tag delimiter *)
  | Rangle_empty             (* /> as tag delimiter *)
  | Percent                  (* % followed by space in declaration *)
  | Plus                     (* + in declaration *)
  | Star                     (* * in declaration *)
  | Bar                      (* | in declaration *)
  | Comma                    (* , in declaration *)
  | Qmark                    (* ? in declaration *)
  | Pcdata                   (* #PCDATA in declaration *)
  | Required                 (* #REQUIRED in declaration *)
  | Implied                  (* #IMPLIED in declaration *)
  | Fixed                    (* #FIXED in declaration *)
  | Bof                      (* A marker for 'beginning of file' *)
  | Eof                      (* End of file *)
  | Conditional_begin of entity_id  (* <![ in declaration *)
  | Conditional_body  of entity_id  (* [ in declaration *)   (* [restored] *)
  | Conditional_end   of entity_id  (* ]]> in declaration *) (* [restored] *)
  | Doctype        of entity_id  (* <!DOCTYPE *)
  | Doctype_rangle of entity_id  (* > as DOCTYPE delimiter *) (* [restored] *)
  | Dtd_begin      of entity_id  (* '[' after DOCTYPE *)
  | Dtd_end        of entity_id  (* ']' *)
  | Decl_element   of entity_id  (* <!ELEMENT *)
  | Decl_attlist   of entity_id  (* <!ATTLIST *)             (* [restored] *)
  | Decl_entity    of entity_id  (* <!ENTITY *)              (* [restored] *)
  | Decl_notation  of entity_id  (* <!NOTATION *)            (* [restored] *)
  | Decl_rangle    of entity_id  (* > *)                     (* [restored] *)
  | Lparen         of entity_id  (* ( in declaration *)
  | Rparen         of entity_id  (* ) in declaration *)
  | RparenPlus     of entity_id  (* )+ in declaration *)
  | RparenStar     of entity_id  (* )* in declaration *)
  | RparenQmark    of entity_id  (* )? in declaration *)

  | Tag_beg of (string*entity_id)     (* <name *)
  | Tag_end of (string*entity_id)     (* </name *)           (* [restored] *)

  | PI        of (string*string)      (* <?name ... ?> *)    (* [restored] *)
  | PI_xml    of (prolog_token list)  (* <?xml ...?> *)
  | Cdata     of string               (* <![CDATA[...]]> *)
  | CRef      of int                  (* &#digits; *)
  | ERef      of string               (* &name; *)
  | PERef     of string               (* %name; *)
  | CharData  of string             (* any characters not otherwise matching *)
  | LineEnd   of string
  | Name      of string               (* name *)
  | Nametoken of string               (* nmtoken but not name *)
  | Attval    of string           (* attribute value; may contain entity refs *)
  | Attval_nl_normalized of string
  | Unparsed_string of string         (* "data" or 'data' *)


(**********************************************************************)
(* debugging *)

(* string_of_tok tok returns the name of the constructor of tok as a
 * string; any payload is ignored.  The match deliberately has no catch-all
 * case, so adding a constructor to token without extending this function
 * is reported by the compiler.
 *)
let string_of_tok tok =
  match tok with
    Begin_entity -> "Begin_entity"
  | End_entity -> "End_entity"
  | Doctype _ -> "Doctype"
  | Doctype_rangle _ -> "Doctype_rangle"
  | Comment_begin -> "Comment_begin"
  | Comment_end -> "Comment_end"
  | Comment_material _ -> "Comment_material"
  | Rangle -> "Rangle"
  | Rangle_empty -> "Rangle_empty"
  | Ignore -> "Ignore"
  | Eq -> "Eq"
  | Dtd_begin _ -> "Dtd_begin"
  | Dtd_end _ -> "Dtd_end"
  | Conditional_begin _ -> "Conditional_begin"
  | Conditional_body _ -> "Conditional_body"
  | Conditional_end _ -> "Conditional_end"
  | Percent -> "Percent"
  | Lparen _ -> "Lparen"
  | Rparen _ -> "Rparen"
  | Plus -> "Plus"
  | Star -> "Star"
  | Bar -> "Bar"
  | Comma -> "Comma"
  | Qmark -> "Qmark"
  | Pcdata -> "Pcdata"
  | Required -> "Required"
  | Implied -> "Implied"
  | Fixed -> "Fixed"
  | Decl_element _ -> "Decl_element"
  | Decl_attlist _ -> "Decl_attlist"
  | Decl_entity _ -> "Decl_entity"
  | Decl_notation _ -> "Decl_notation"
  | Decl_rangle _ -> "Decl_rangle"
  | RparenPlus _ -> "RparenPlus"
  | RparenStar _ -> "RparenStar"
  | RparenQmark _ -> "RparenQmark"
  | Bof -> "Bof"
  | Eof -> "Eof"
  | PI _ -> "PI"
  | PI_xml _ -> "PI_xml"
  | Tag_beg _ -> "Tag_beg"
  | Tag_end _ -> "Tag_end"
  | Cdata _ -> "Cdata"
  | CRef _ -> "CRef"
  | ERef _ -> "ERef"
  | PERef _ -> "PERef"
  | CharData _ -> "CharData"
  | Name _ -> "Name"
  | Nametoken _ -> "Nametoken"
  | Attval _ -> "Attval"
  | Attval_nl_normalized _ -> "Attval_nl_normalized"
  | Unparsed_string _ -> "Unparsed_string"
  | LineEnd _ -> "LineEnd"


(* A bundle of scanner functions together with the encoding they belong to.
 * The first nine scanners correspond one-to-one to the constructors of
 * type lexers and return a pair (token, lexers).
 * NOTE(review): the lexers component is presumably the scanner to use for
 * the following token -- confirm against the lexer implementation.
 * The remaining scanners return a single token (or prolog_token) only.
 *)
type lexer_set =
    { lex_encoding         : Pxp_types.rep_encoding;
      scan_document        : Lexing.lexbuf -> (token * lexers);
      scan_content         : Lexing.lexbuf -> (token * lexers);
      scan_within_tag      : Lexing.lexbuf -> (token * lexers);
      scan_document_type   : Lexing.lexbuf -> (token * lexers);
      scan_declaration     : Lexing.lexbuf -> (token * lexers);
      scan_content_comment : Lexing.lexbuf -> (token * lexers);
      scan_decl_comment    : Lexing.lexbuf -> (token * lexers);
      scan_document_comment: Lexing.lexbuf -> (token * lexers);
      scan_ignored_section : Lexing.lexbuf -> (token * lexers);
      scan_xml_pi          : Lexing.lexbuf -> prolog_token;
      scan_dtd_string      : Lexing.lexbuf -> token;
      scan_content_string  : Lexing.lexbuf -> token;
      scan_name_string     : Lexing.lexbuf -> token;
      scan_only_xml_decl   : Lexing.lexbuf -> token;
      scan_for_crlf        : Lexing.lexbuf -> token;
    }

(* ======================================================================
 * History:
 *
 * $Log$
 * Revision 1.1  2000/11/17 09:57:29  lpadovan
 * Initial revision
 *
 * Revision 1.2  2000/08/18 20:14:31  gerd
 *      Comment -> Comment_begin, Comment_material, Comment_end.
 *
 * Revision 1.1  2000/05/29 23:48:38  gerd
 *      Changed module names:
 *              Markup_aux          into Pxp_aux
 *              Markup_codewriter   into Pxp_codewriter
 *              Markup_document     into Pxp_document
 *              Markup_dtd          into Pxp_dtd
 *              Markup_entity       into Pxp_entity
 *              Markup_lexer_types  into Pxp_lexer_types
 *              Markup_reader       into Pxp_reader
 *              Markup_types        into Pxp_types
 *              Markup_yacc         into Pxp_yacc
 * See directory "compatibility" for (almost) compatible wrappers emulating
 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 *
 * ======================================================================
 * Old logs from markup_lexer_types.ml:
 *
 * Revision 1.6  2000/05/29 21:14:57  gerd
 *      Changed the type 'encoding' into a polymorphic variant.
 *
 * Revision 1.5  2000/05/20 20:31:40  gerd
 *      Big change: Added support for various encodings of the
 * internal representation.
 *
 * Revision 1.4  2000/05/14 17:45:36  gerd
 *      Bugfix.
 *
 * Revision 1.3  2000/05/14 17:35:12  gerd
 *      Conditional_begin, _end, and _body have an entity_id.
 *
 * Revision 1.2  2000/05/08 21:59:06  gerd
 *      New token Bof (beginning of file).
 *
 * Revision 1.1  2000/05/06 23:21:49  gerd
 *      Initial revision.
 *
 *
 * ======================================================================
 *
 * DERIVED FROM REVISION 1.4 of markup_lexer_types_shadow.ml
 *
 * Revision 1.4  2000/04/30 18:19:04  gerd
 *      Added new tokens.
 *
 * Revision 1.3  1999/08/31 19:13:31  gerd
 *      Added checks on proper PE nesting. The idea is that tokens such
 * as Decl_element and Decl_rangle carry an entity ID with them. This ID
 * is simply an object of type < >, i.e. you can only test on identity.
 * The lexer always produces tokens with a dummy ID because it does not
 * know which entity is the current one. The entity layer replaces the dummy
 * ID with the actual ID. The parser checks that the IDs of pairs such as
 * Decl_element and Decl_rangle are the same; otherwise a Validation_error
 * is produced.
 *
 * Revision 1.2  1999/08/10 21:35:08  gerd
 *      The XML/encoding declaration at the beginning of entities is
 * evaluated. In particular, entities have now a method "xml_declaration"
 * which returns the name/value pairs of such a declaration. The "encoding"
 * setting is interpreted by the entity itself; "version", and "standalone"
 * are interpreted by Markup_yacc.parse_document_entity. Other settings
 * are ignored (this does not conform to the standard; the standard prescribes
 * that "version" MUST be given in the declaration of document; "standalone"
 * and "encoding" CAN be declared; no other settings are allowed).
 *      TODO: The user should be warned if the standard is not exactly
 * fulfilled. -- The "standalone" property is not checked yet.
 *
 * Revision 1.1  1999/08/10 00:35:51  gerd
 *      Initial revision.
 *
 *
 *)