helm/DEVEL/pxp/pxp/lexers/pxp_lex_content.src

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  *
   4  *)
   5
   6
   7 {
   8   open Pxp_types
   9   open Pxp_lexer_types
  10
  11 #insert pxp_lex_aux.src
  12
  13 #insert open_pxp_lex_aux_*.src
  14 #insert open_pxp_lex_misc_*.src
  15
  16 }
  17
  18 #insert pxp_lex_defs_*.def
  19
   20 rule scan_content = parse
          (* scan_content: lexer rule for the text and markup found inside
           * element content.
           *
           * Each action either raises or evaluates to a pair. The first
           * component is the token; the second is one of Content, Within_tag
           * or Content_comment -- presumably the lexer state to use for the
           * next token (TODO confirm against Pxp_lexer_types).
           *
           * The named patterns (pi_string, name, cdata_string, ascii_digit,
           * ascii_hexdigit, normal_character), scan_pi, scan_xml_pi,
           * dummy_entity and the tok_... values are not defined in this
           * file; presumably they come from the inserted files.
           *
           * ocamllex takes the longest match and, on a tie, the earliest
           * case. The short error cases below therefore only fire when no
           * longer, well-formed case starting with the same characters
           * applies. Do not reorder cases without checking this.
           *)
   21     "<?" pi_string "?>"
          (* Complete processing instruction: the whole lexeme is handed to
           * scan_pi together with scan_xml_pi.
           *)
   22       { scan_pi (Lexing.lexeme lexbuf) scan_xml_pi, Content }
   23   | "<?"
          (* Start of a processing instruction that the case above could not
           * match completely.
           *)
   24       { raise (WF_error ("Illegal processing instruction")) }
   25   | "<!--"
   26       { Comment_begin, Content_comment }
   27   | '<' '/'? name
   28       (* One rule for Tag_beg and Tag_end saves transitions. *)
   29       { let s = Lexing.lexeme lexbuf in
              (* s.[1] tells start tag and end tag apart. This assumes that
               * name matches at least one character, so that s.[1] exists
               * (TODO confirm in the definition of name).
               *)
   30         if s.[1] = '/' then
                (* End tag: drop the two leading characters, keep the name. *)
   31           Tag_end (String.sub s 2 (String.length s - 2), dummy_entity),
   32           Within_tag
   33         else
                (* Start tag: drop the leading angle bracket, keep the name. *)
   34           Tag_beg (String.sub s 1 (String.length s - 1), dummy_entity),
   35           Within_tag
   36       }
   37   | "<![CDATA[" cdata_string "]]>"
   38       { let s = Lexing.lexeme lexbuf in
              (* The opening delimiter is 9 characters long and the closing
               * one 3, so this extracts exactly the text between them.
               *)
   39         Cdata (String.sub s 9 (String.length s - 12)), Content }
   40   | "<!"
          (* Any other declaration-like start, i.e. neither a comment nor a
           * CDATA section matched.
           *)
   41       { raise (WF_error "Declaration either malformed or not allowed in this context")
   42       }
   43   | "<"
          (* A left angle bracket that starts none of the constructs above. *)
   44       { raise (WF_error ("The left angle bracket '<' must be written as '&lt;'"))
   45       }
   46   | "&#" ascii_digit+ ";"
   47       { let s = Lexing.lexeme lexbuf in
              (* Decimal character reference: strip the two-character prefix
               * and the trailing semicolon, then convert the digits.
               * NOTE(review): int_of_string raises Failure when the number
               * does not fit into an int; that exception is not turned into
               * a WF_error here.
               *)
   48         CRef (int_of_string (String.sub s 2 (String.length s - 3))), Content }
   49   | "&#x" ascii_hexdigit+ ";"
   50       { let s = Lexing.lexeme lexbuf in
              (* Hexadecimal character reference: strip the three-character
               * prefix and the trailing semicolon; the 0x prefix makes
               * int_of_string read the digits as hexadecimal. The Failure
               * remark on the decimal case applies here as well.
               *)
   51         CRef (int_of_string ("0x" ^ String.sub s 3 (String.length s - 4))), Content }
   52   | "&" name ";"
   53       { let s = Lexing.lexeme lexbuf in
              (* Entity reference: keep only the name between the ampersand
               * and the semicolon.
               *)
   54         ERef (String.sub s 1 (String.length s - 2)), Content }
   55   | "&"
          (* An ampersand that starts neither a character reference nor an
           * entity reference.
           *)
   56       { raise (WF_error ("The ampersand '&' must be written as '&amp;'"))
   57       }
   58
   59   (* LineEnd: Depending on whether we are reading from a primary source
   60    * (file) or from the replacement text of an internal entity, line endings
   61    * must be normalized (converted to \n) or not.
   62    * The entity classes do that. The yacc parser will never see LineEnd;
   63    * this token is always converted to the appropriate CharData token.
   64    *)
   65
          (* The two-character CR LF case is listed first and is also the
           * longest match, so a CR LF pair is reported as one line end.
           *)
   66   | '\013' '\010'
   67       { tok_LineEndCRLF__Content }
   68   | '\013'
   69       { tok_LineEndCR__Content }
   70   | '\010'
   71       { tok_LineEndLF__Content }
   72   | eof
   73       { tok_Eof__Content }
   74   | "]]>"
          (* The CDATA end delimiter is rejected when it appears in ordinary
           * content.
           *)
   75       { raise (WF_error ("The sequence ']]>' must be written as ']]&gt;'"))
   76       }
   77   | "]"
          (* A right bracket that is not part of the sequence rejected above
           * gets its own token; presumably normal_character excludes the
           * right bracket so that the previous case can be detected
           * (TODO confirm in the definition of normal_character).
           *)
   78       { tok_CharDataRBRACKET__Content }
   79   | normal_character+
          (* A run of ordinary characters becomes one CharData token. *)
   80       { let s = Lexing.lexeme lexbuf in
   81         CharData s, Content
   82       }
   83   | _
          (* Fallback for any character that no case above accepts;
           * presumably input that is invalid in the current encoding.
           *)
   84       { raise Netconversion.Malformed_code }
  85
  86
  87 (* ======================================================================
  88  * History:
  89  *
  90  * $Log$
  91  * Revision 1.1  2000/11/17 09:57:32  lpadovan
  92  * Initial revision
  93  *
  94  * Revision 1.4  2000/08/18 20:19:59  gerd
  95  *      Comments return different comment tokens.
  96  *
  97  * Revision 1.3  2000/08/14 22:18:34  gerd
  98  *      Bad_character_stream -> Netconversion.Malformed_code
  99  *
 100  * Revision 1.2  2000/05/29 23:53:12  gerd
 101  *      Updated because Markup_* modules have been renamed to Pxp_*.
 102  *
 103  * Revision 1.1  2000/05/20 20:33:25  gerd
 104  *      Initial revision.
 105  *
 106  *
 107  *)