(* Copyright (C) 2004, HELM Team. * * This file is part of HELM, an Hypertextual, Electronic * Library of Mathematics, developed at the Computer Science * Department, University of Bologna, Italy. * * HELM is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * HELM is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HELM; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, * MA 02111-1307, USA. * * For details, see the HELM World-Wide-Web page, * http://helm.cs.unibo.it/ *) exception Error of int * int * string exception Not_an_extended_ident let regexp alpha = [ 'a' - 'z' 'A' - 'Z' ] let regexp digit = [ '0' - '9' ] let regexp blank = [ ' ' '\t' '\n' ] let regexp paren = [ '(' '[' '{' ')' ']' '}' ] let regexp implicit = '?' let regexp placeholder = '%' let regexp symbol_char = [^ 'a' - 'z' 'A' - 'Z' '0' - '9' ' ' '\t' '\n' '\\' '(' '[' '{' ')' ']' '}' '?' ] let regexp comment_char = [^'*'] | '*'[^')'] let regexp note = "(*" ([^'*'] | "**") comment_char* "*)" let regexp commentbegin = "(**" blank let regexp commentend = "*)" let regexp blanks = blank+ let regexp num = digit+ let regexp tex_token = '\\' alpha+ let regexp symbol = symbol_char let regexp ident_cont = alpha | num | '_' | '\'' let regexp ident_cont' = ident_cont | tex_token let regexp ident = (alpha ident_cont*) | ('_' ident_cont+) let regexp ident' = ((alpha | tex_token) ident_cont'*) | ('_' ident_cont'+) let regexp uri_step = (alpha (ident_cont|'-')*) let regexp meta = implicit num let regexp qstring = '"' [^ '"']* '"' let regexp uri_suffix = "con"|"ind"|"var" let regexp uri = ("cic:/" | "theory:/") (* schema *) uri_step ('/' uri_step)* (* path *) ('.' uri_suffix)+ (* ext *) ("#xpointer(" num ('/' num)+ ")")? (* xpointer *) (* let regexp catchall = .* *) let keywords = Hashtbl.create 17 let _ = List.iter (fun keyword -> Hashtbl.add keywords keyword ("", keyword)) [ "Prop"; "Type"; "Set"; "let"; "Let"; "rec"; "using"; "match"; "with"; "in"; "and"; "to"; "as"; "on"; "names"] let error lexbuf msg = raise (Error (Ulexing.lexeme_start lexbuf, Ulexing.lexeme_end lexbuf, msg)) let error_at_end lexbuf msg = raise (Error (Ulexing.lexeme_end lexbuf, Ulexing.lexeme_end lexbuf, msg)) let return lexbuf token = let flocation_begin = { Lexing.pos_fname = ""; Lexing.pos_lnum = -1; Lexing.pos_bol = -1; Lexing.pos_cnum = Ulexing.lexeme_start lexbuf } in let flocation_end = { flocation_begin with Lexing.pos_cnum = Ulexing.lexeme_end lexbuf } in (token, (flocation_begin, flocation_end)) (* let parse_ext_ident ident = let len = String.length ident in let buf = Buffer.create len in let in_tex_token = ref false in let tex_token = Buffer.create 10 in try for i = 0 to len - 1 do match ident.[i] with | '\' when not !in_tex_token -> if i < len - 1 && in_tex_token := true done with Invalid_argument -> assert false let rec token' = lexer | ident' -> (try let ident = parse_ext_ident (Ulexing.utf8_lexeme lexbuf) in return lexbuf ("IDENT'", ident) with Not_an_extended_ident -> Ulexing.rollback lexbuf; token lexbuf) | _ -> Ulexing.rollback lexbuf; token lexbuf and token = lexer *) let remove_quotes s = String.sub s 1 (String.length s - 2) let rec token comments = lexer | blanks -> token comments lexbuf | uri -> return lexbuf ("URI", Ulexing.utf8_lexeme lexbuf) | ident -> let lexeme = Ulexing.utf8_lexeme lexbuf in (try return lexbuf (Hashtbl.find keywords lexeme) with Not_found -> return lexbuf ("IDENT", lexeme)) | num -> return lexbuf ("NUM", Ulexing.utf8_lexeme lexbuf) | paren -> return lexbuf ("PAREN", Ulexing.utf8_lexeme lexbuf) | meta -> return lexbuf ("META", Ulexing.utf8_lexeme lexbuf) | implicit -> return lexbuf ("IMPLICIT", Ulexing.utf8_lexeme lexbuf) | placeholder -> return lexbuf ("PLACEHOLDER", Ulexing.utf8_lexeme lexbuf) | qstring -> return lexbuf ("QSTRING", remove_quotes (Ulexing.utf8_lexeme lexbuf)) | symbol -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf) | tex_token -> let macro = Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1) in (try return lexbuf ("SYMBOL", Utf8Macro.expand macro) with Utf8Macro.Macro_not_found _ -> return lexbuf ("SYMBOL", Ulexing.utf8_lexeme lexbuf)) | note -> (*if comments then*) let comment = Ulexing.utf8_sub_lexeme lexbuf 2 (Ulexing.lexeme_length lexbuf - 4) in return lexbuf ("NOTE", comment) (*else token comments lexbuf*) | commentbegin -> return lexbuf ("BEGINCOMMENT","") | commentend -> return lexbuf ("ENDCOMMENT","") | eof -> return lexbuf ("EOI", "") | _ -> error lexbuf "Invalid character" let tok_func comments stream = let lexbuf = Ulexing.from_utf8_stream stream in Token.make_stream_and_flocation (fun () -> try token comments lexbuf with | Ulexing.Error -> error_at_end lexbuf "Unexpected character" | Ulexing.InvalidCodepoint i -> error_at_end lexbuf "Invalid code point") let cic_lexer ?(comments = false) () = { Token.tok_func = tok_func comments; Token.tok_using = (fun _ -> ()); Token.tok_removing = (fun _ -> ()); Token.tok_match = Token.default_match; Token.tok_text = Token.lexer_text; Token.tok_comm = None; }