exception Error of int * int * string let regexp alpha = [ 'a' - 'z' 'A' - 'Z' ] let regexp digit = [ '0' - '9' ] let regexp blank = [ ' ' '\t' '\n' ] let regexp blanks = blank+ let regexp num = digit+ let regexp ident = alpha (alpha | num)* let regexp symbol = [^ 'a' - 'z' 'A' - 'Z' '0' - '9' ' ' '\t' '\n' ] let regexp tex_token = '\\' alpha+ let regexp lparen = [ '(' '[' '{' ] let regexp rparen = [ ')' ']' '}' ] let regexp meta = '?' (alpha | num)+ let keywords = Hashtbl.create 17 let _ = List.iter (fun keyword -> Hashtbl.add keywords keyword ("", keyword)) [ "Prop"; "Type"; "Set"; "let"; "rec"; "using"; "match"; "with" ] let error lexbuf msg = raise (Error (Ulexing.lexeme_start lexbuf, Ulexing.lexeme_end lexbuf, msg)) let error_at_end lexbuf msg = raise (Error (Ulexing.lexeme_end lexbuf, Ulexing.lexeme_end lexbuf, msg)) let return lexbuf token = (token, Ulexing.loc lexbuf) let rec token = lexer | blanks -> token lexbuf | ident -> let lexeme = Ulexing.utf8_lexeme lexbuf in (try return lexbuf (Hashtbl.find keywords lexeme) with Not_found -> return lexbuf ("IDENT", lexeme)) | num -> return lexbuf ("INT", Ulexing.utf8_lexeme lexbuf) | lparen -> return lexbuf ("LPAREN", Ulexing.utf8_lexeme lexbuf) | rparen -> return lexbuf ("RPAREN", Ulexing.utf8_lexeme lexbuf) | meta -> return lexbuf ("META", Ulexing.utf8_lexeme lexbuf) | symbol -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf) | tex_token -> let macro = Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1) in (try return lexbuf ("SYMBOL", Macro.expand macro) with Macro.Macro_not_found _ -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf)) | eof -> return lexbuf ("EOI", "") | _ -> error lexbuf "Invalid character" let tok_func stream = let lexbuf = Ulexing.from_utf8_stream stream in Token.make_stream_and_location (fun () -> try token lexbuf with | Ulexing.Error -> error_at_end lexbuf "Unexpected character" | Ulexing.InvalidCodepoint i -> error_at_end lexbuf "Invalid code point") let lex = { Token.tok_func = tok_func; Token.tok_using = (fun _ -> ()); Token.tok_removing = (fun _ -> ()); Token.tok_match = Token.default_match; Token.tok_text = Token.lexer_text; Token.tok_comm = None; }