helm/DEVEL/pxp/netstring/nethtml_scanner.mll

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  *
   4  *)
   5
   6 {
   7   type token =                (* tokens returned by the scanners below *)
   8       Lcomment                (* "<!--" in document text *)
   9     | Rcomment                (* "-->" in a comment *)
  10     | Mcomment                (* comment content; carries no text *)
  11     | Ldoctype                (* "<!" in document text *)
  12     | Rdoctype                (* ">" in a declaration *)
  13     | Mdoctype                (* declaration content; carries no text *)
  14     | Lelement of string      (* "<" name; carries the name *)
  15     | Lelementend of string   (* "</" name; carries the name *)
  16     | Relement                (* ">" inside a tag *)
  17     | Cdata of string         (* character data, or a single stray "<" *)
  18     | Space of int            (* whitespace run inside a tag; its length *)
  19     | Name of string          (* a name inside a tag *)
  20     | Is                      (* "=" inside a tag *)
  21     | Literal of string       (* quoted string without its quotes *)
  22     | Other                   (* any other single character inside a tag *)
  23     | Eof                     (* end of input, from every scanner *)
  24 }
  25
  26 (* Simplified rules: only Latin-1 is recognized as the character set.
  27  * Codes 215 and 247 (multiplication and division signs) are not letters. *)
  28 let letter = ['a'-'z' 'A'-'Z' '\192'-'\214' '\216'-'\246' '\248'-'\255']
  29 let extender = '\183'
  30 let digit = ['0'-'9']
  31 let hexdigit = digit | ['A'-'F' 'a'-'f']
  32 let namechar = letter | digit | extender | ['.' ':' '-' '_']
  33 let name = (letter | ['_' ':']) namechar*
  34 let nmtoken = namechar+
  35 let ws = [' ' '\t' '\r' '\n']
  36 let string_literal1 = '"' [^ '"' '<' '>' '\n']* '"'
  37 let string_literal2 = '\'' [^ '\'' '<' '>' '\n']* '\''
  38
  39
  40 (* The following rules reflect HTML as it is used, not the SGML
  41  * rules.
  42  *)
  43
  44 rule scan_document = parse
  45   (* Scanner for ordinary document content, outside of any markup. *)
  46   | "<!--"                (* wins over "<!" because it is the longer match *)
  47       { Lcomment }
  48   | "<!"
  49       { Ldoctype }
  50   | "<" name              (* the token carries the name without "<" *)
  51       { let tag = Lexing.lexeme lexbuf in
  52         let len = String.length tag in
  53         Lelement (String.sub tag 1 (len - 1)) }
  54   | "</" name             (* the token carries the name without "</" *)
  55       { let tag = Lexing.lexeme lexbuf in
  56         let len = String.length tag in
  57         Lelementend (String.sub tag 2 (len - 2)) }
  58   | "<"                   (* a "<" that starts no markup is plain text *)
  59       { Cdata "<" }
  60   | eof
  61       { Eof }
  62   | [^ '<' ]+  { Cdata (Lexing.lexeme lexbuf) }
  63
  64 and scan_special = parse
  65   (* Scanner for content in which only an end tag counts as markup. *)
  66   | "</" name             (* the token carries the name without "</" *)
  67       { let tag = Lexing.lexeme lexbuf in
  68         let len = String.length tag in
  69         Lelementend (String.sub tag 2 (len - 2)) }
  70   | "<"                   (* any other "<" is plain text *)
  71       { Cdata "<" }
  72   | eof
  73       { Eof }
  74   | [^ '<' ]+  { Cdata (Lexing.lexeme lexbuf) }
  75
  76
  77 and scan_comment = parse
  78   (* Scanner for comment content, up to the closing "-->".  The text
  79    * itself is not returned. *)
  80   | "-->"
  81       { Rcomment }
  82   | "-" | [^ '-']+        (* one dash, or a run of non-dash characters *)
  83       { Mcomment }
  84   | eof
  85       { Eof }
  86
  87 and scan_doctype = parse
  88   (* Scanner for declaration content.  The first ">" closes the
  89    * declaration; strings and [ ] brackets get no special treatment.
  90    *)
  91   | ">"         { Rdoctype }
  92   | eof         { Eof }
  93   | [^ '>']+    { Mdoctype }
  94
  95 and scan_element = parse
  96   (* Scanner for the inside of a tag: names, "=", quoted literals and
  97    * whitespace, up to the closing ">".
  98    *)
  99   | ">"
 100       { Relement }
 101   | ws+                   (* the token carries the length of the run *)
 102       { Space (String.length (Lexing.lexeme lexbuf)) }
 103   | name
 104       { Name (Lexing.lexeme lexbuf) }
 105   | "="
 106       { Is }
 107   | string_literal1 | string_literal2
 108       (* Both quoting styles: drop the quote character at each end. *)
 109       { let quoted = Lexing.lexeme lexbuf in
 110         let inner = String.length quoted - 2 in
 111         Literal (String.sub quoted 1 inner) }
 112   | eof
 113       { Eof }
 114   | _                     (* any other single character *)
 115       { Other }
 116
 117 (* ======================================================================
 118  * History:
 119  *
 120  * $Log$
 121  * Revision 1.1  2000/11/17 09:57:28  lpadovan
 122  * Initial revision
 123  *
 124  * Revision 1.1  2000/03/03 01:07:25  gerd
 125  *      Initial revision.
 126  *
 127  *
 128  *)