(* $Id$
 * ----------------------------------------------------------------------
 *
 *)

{
  (* Tokens returned by the scanner entry points defined in this file.
   *
   * - Lcomment / Mcomment / Rcomment: start, body piece and end of a comment
   * - Ldoctype / Mdoctype / Rdoctype: start, body piece and end of a
   *   declaration beginning with "<!"
   * - Lelement n:    start tag opener, n is the tag name without "<"
   * - Lelementend n: end tag opener, n is the tag name without "</"
   * - Relement:      the ">" closing a tag
   * - Cdata s:       character data
   * - Space k:       a run of k whitespace characters inside a tag
   * - Name s, Is, Literal s, Other: attribute name, "=", quoted value
   *   (quotes removed), and any other single character inside a tag
   * - Eof:           end of input
   *)
  type token =
      Lcomment
    | Rcomment
    | Mcomment
    | Ldoctype
    | Rdoctype
    | Mdoctype
    | Lelement of string
    | Lelementend of string
    | Relement
    | Cdata of string
    | Space of int
    | Name of string
    | Is
    | Literal of string
    | Other
    | Eof
}

(* Simplified rules: Only Latin-1 is recognized as character set *)

let letter = ['A'-'Z' 'a'-'z' '\192'-'\214' '\216'-'\246' '\248'-'\255']
let extender = '\183'
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'A'-'F' 'a'-'f']
let namechar = letter | digit | '.' | ':' | '-' | '_' | extender
let name = ( letter | '_' | ':' ) namechar*
let nmtoken = namechar+
let ws = [ ' ' '\t' '\r' '\n' ]

(* Quoted attribute values. Besides the closing quote, the characters
 * ">", "<" and newline are not accepted inside a literal.
 *)
let string_literal1 = '"' [^ '"' '>' '<' '\n']* '"'
let string_literal2 = "'" [^ '\'' '>' '<' '\n']* "'"


(* The following rules reflect HTML as it is used, not the SGML
 * rules.
 *)

(* NOTE(review): the rules from the comment opener in scan_document up to
 * the comment terminator in scan_comment were missing from this file (the
 * text between the two HTML comment delimiters had apparently been
 * stripped). They are reconstructed here from the token type and the
 * surviving fragments; please compare with the upstream scanner.
 *)

(* Scans ordinary document content. Returns the token that tells the
 * caller which entry point to continue with: Lcomment (scan_comment),
 * Ldoctype (scan_doctype), Lelement or Lelementend (scan_element);
 * everything else is returned as Cdata.
 *)
rule scan_document = parse
  | "<!--"
      { Lcomment }
  | "<!"
      { Ldoctype }
  | "<" name
      { let s = Lexing.lexeme lexbuf in
        (* drop the leading "<" *)
        Lelement (String.sub s 1 (String.length s - 1))
      }
  | "</" name
      { let s = Lexing.lexeme lexbuf in
        (* drop the leading "</" *)
        Lelementend (String.sub s 2 (String.length s - 2))
      }
  | "<"                (* misplaced "<" is treated as character data *)
      { Cdata "<" }
  | eof
      { Eof }
  | [^ '<' ]+
      { Cdata (Lexing.lexeme lexbuf) }

(* Scans content in which only an end tag is recognized as markup; any
 * other text, including a "<" that does not start an end tag, is Cdata.
 * NOTE(review): presumably intended for elements whose content must not
 * be parsed as markup - confirm against the caller.
 *)
and scan_special = parse
  | "</" name
      { let s = Lexing.lexeme lexbuf in
        Lelementend (String.sub s 2 (String.length s - 2))
      }
  | "<"
      { Cdata "<" }
  | eof
      { Eof }
  | [^ '<' ]+
      { Cdata (Lexing.lexeme lexbuf) }

(* Scans the inside of a comment, after Lcomment has been returned.
 * The comment text is reported as a sequence of Mcomment tokens (the
 * text itself is not returned) and the terminator as Rcomment.
 *)
and scan_comment = parse
  | "-->"
      { Rcomment }
  | "-"
      { Mcomment }
  | eof
      { Eof }
  | [^ '-']+
      { Mcomment }

(* Scans the inside of a declaration, after Ldoctype has been returned.
 * The first ">" ends the declaration.
 *)
and scan_doctype = parse
  | ">"                (* Occurrence in strings, and [ ] brackets ignored *)
      { Rdoctype }
  | eof
      { Eof }
  | [^ '>' ]+
      { Mdoctype }

(* Scans the inside of a start or end tag, after Lelement or Lelementend
 * has been returned, up to and including the closing ">".
 *)
and scan_element = parse
  | ">"
      { Relement }
  | ws+
      { Space (String.length (Lexing.lexeme lexbuf)) }
  | name
      { Name (Lexing.lexeme lexbuf) }
  | "="
      { Is }
  | string_literal1
      { let s = Lexing.lexeme lexbuf in
        (* strip the surrounding double quotes *)
        Literal (String.sub s 1 (String.length s - 2))
      }
  | string_literal2
      { let s = Lexing.lexeme lexbuf in
        (* strip the surrounding single quotes *)
        Literal (String.sub s 1 (String.length s - 2))
      }
  | eof
      { Eof }
  | _
      { Other }

(* ======================================================================
 * History:
 *
 * $Log$
 * Revision 1.1  2000/11/17 09:57:28  lpadovan
 * Initial revision
 *
 * Revision 1.1  2000/03/03 01:07:25  gerd
 * 	Initial revision.
 *
 *
 *)