(* Copyright (C) 2005, HELM Team.
 *
 * This file is part of HELM, an Hypertextual, Electronic
 * Library of Mathematics, developed at the Computer Science
 * Department, University of Bologna, Italy.
 *
 * HELM is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * HELM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HELM; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA  02111-1307, USA.
 *
 * For details, see the HELM World-Wide-Web page,
 * http://helm.cs.unibo.it/
 *)

(* $Id: cicNotationLexer.ml 11231 2011-03-30 11:52:27Z ricciott $ *)

open Printf

exception Error of int * int * string

module StringSet = Set.Make(String)

(* Lexer *)
let regexp number = xml_digit+
let regexp utf8_blank = " " | "\r\n" | "\n" | "\t" | [160] (* this is a nbsp *)
let regexp percentage =
  ('-' | "") [ '0' - '9' ] + '%'
let regexp floatwithunit =
  ('-' | "") [ '0' - '9' ] + ["."] [ '0' - '9' ] + ([ 'a' - 'z' ] + | "" )
let regexp color = "#" [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f'
'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ]
[ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ]

  (* ZACK: breaks unicode's binder followed by an ascii letter without blank *)
(* let regexp ident_letter = xml_letter *)

let regexp ident_letter = [ 'a' - 'z' 'A' - 'Z' ]

  (* must be in sync with "is_ligature_char" below *)
let regexp ligature_char = [ "'`~!?@*()[]<>-+=|:;.,/\"" ]
let regexp ligature = ligature_char ligature_char+

let regexp we_proved = "we" utf8_blank+ "proved"
let regexp we_have = "we" utf8_blank+ "have"
let regexp let_rec = "let" utf8_blank+ "rec"
let regexp let_corec = "let" utf8_blank+ "corec"
let regexp ident_decoration = '\'' | '?' | '`'
let regexp ident_cont = ident_letter | xml_digit | '_'
let regexp ident_start = ident_letter
let regexp ident = ident_letter ident_cont* ident_decoration*
let regexp variable_ident = '_' '_' number
let regexp pident = '_' ident

let regexp uri_step = [ 'a' - 'z' 'A' - 'Z' '0' - '9' '_' '-' ''' ]+

let regexp uri =
  ("cic:/" | "theory:/")              (* schema *)
(*   ident ('/' ident)*                  |+ path +| *)
  uri_step ('/' uri_step)*            (* path *)
  ('.' ident)+                        (* ext *)
(*   ("#xpointer(" number ('/' number)+ ")")?  (* xpointer *) *)
  ("(" number (',' number)* ")")?     (* reference spec *)

let regexp qstring = '"' [^ '"']* '"'

(* NOTE: the definitions from hreftag up to the ligatures table were damaged
 * in this copy of the file; they are reconstructed here from their use sites
 * below (the ATAG/ATAGEND tokens and the fixed-length prefixes stripped in
 * get_hot_spots).  In particular the unicode macro names "to", "Rightarrow"
 * and "def" are assumed. *)
let regexp hreftag = "<" [ 'A' 'a' ]
let regexp href = "href=\"" uri "\""
let regexp hreftitle = "title=" qstring
let regexp hrefclose = "</" [ 'A' 'a' ] ">"

(* ligature -> symbol table; a ligature may be bound to several symbols *)
let ligatures = Hashtbl.create 23

let _ =
  List.iter
    (fun (ligature, symbol) -> Hashtbl.add ligatures ligature symbol)
    [ ("->", <:unicode<to>>);
      ("=>", <:unicode<Rightarrow>>);
      (":=", <:unicode<def>>);
    ]
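
(* Illustrative sketch, not part of the original file: a ligature may be bound
 * to several symbols, and Hashtbl.find_all returns the bindings most recently
 * added first.  Reversing that list therefore recovers the binding that was
 * registered first, which is what ligatures_token (further below) uses as the
 * default expansion.  The helper name _default_ligature is hypothetical. *)
let _default_ligature lexeme =
  match List.rev (Hashtbl.find_all ligatures lexeme) with
  | default :: _ -> Some default
  | [] -> None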
( "dec" | "def" "(" number ")" | "fix" "(" number "," number "," number ")" | "cfx" "(" number ")" | "ind" "(" number "," number "," number ")" | "con" "(" number "," number "," number ")") (* ext + reference *) let error lexbuf msg = let begin_cnum, end_cnum = Ulexing.loc lexbuf in raise (Error (begin_cnum, end_cnum, msg)) let error_at_end lexbuf msg = let begin_cnum, end_cnum = Ulexing.loc lexbuf in raise (Error (begin_cnum, end_cnum, msg)) let loc_of_buf lexbuf = HExtlib.floc_of_loc (Ulexing.loc lexbuf) let return_with_loc token begin_cnum end_cnum = let flocation = HExtlib.floc_of_loc (begin_cnum,end_cnum) in token, flocation let return lexbuf token = let begin_cnum, end_cnum = Ulexing.loc lexbuf in return_with_loc token begin_cnum end_cnum let return_lexeme lexbuf name = return lexbuf (name, Ulexing.utf8_lexeme lexbuf) let return_symbol lexbuf s = return lexbuf ("SYMBOL", s) let return_eoi lexbuf = return lexbuf ("EOI", "") let remove_quotes s = String.sub s 1 (String.length s - 2) let mk_lexer token = let tok_func stream = (* let lexbuf = Ulexing.from_utf8_stream stream in *) (** XXX Obj.magic rationale. * The problem. * camlp5 constraints the tok_func field of Token.glexer to have type: * Stream.t char -> (Stream.t 'te * flocation_function) * In order to use ulex we have (in theory) to instantiate a new lexbuf each * time a char Stream.t is passed, destroying the previous lexbuf which may * have consumed a character from the old stream which is lost forever :-( * The "solution". * Instead of passing to camlp5 a char Stream.t we pass a lexbuf, casting it to * char Stream.t with Obj.magic where needed. *) let lexbuf = Obj.magic stream in Token.make_stream_and_location (fun () -> try token lexbuf with | Ulexing.Error -> error_at_end lexbuf "Unexpected character" | Ulexing.InvalidCodepoint p -> error_at_end lexbuf (sprintf "Invalid code point: %d" p)) in { Token.tok_func = tok_func; Token.tok_using = (fun _ -> ()); Token.tok_removing = (fun _ -> ()); Token.tok_match = Token.default_match; Token.tok_text = Token.lexer_text; Token.tok_comm = None; } let expand_macro lexbuf = let macro = Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1) in try ("SYMBOL", Utf8Macro.expand macro) with Utf8Macro.Macro_not_found _ -> (* FG: unexpanded TeX macros are terminated by a space for rendering *) "SYMBOL", (Ulexing.utf8_lexeme lexbuf ^ " ") let remove_quotes s = String.sub s 1 (String.length s - 2) let remove_left_quote s = String.sub s 1 (String.length s - 1) let rec level2_pattern_token_group counter buffer = lexer | end_group -> if (counter > 0) then Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; snd (Ulexing.loc lexbuf) | begin_group -> Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; ignore (level2_pattern_token_group (counter + 1) buffer lexbuf) ; level2_pattern_token_group counter buffer lexbuf | _ -> Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; level2_pattern_token_group counter buffer lexbuf let read_unparsed_group token_name lexbuf = let buffer = Buffer.create 16 in let begin_cnum, _ = Ulexing.loc lexbuf in let end_cnum = level2_pattern_token_group 0 buffer lexbuf in return_with_loc (token_name, Buffer.contents buffer) begin_cnum end_cnum let handle_keywords lexbuf k name = let s = Ulexing.utf8_lexeme lexbuf in if k s then return lexbuf ("", s) else return lexbuf (name, s) ;; let rec level2_meta_token = lexer | utf8_blank+ -> level2_meta_token lexbuf | hreftag -> return lexbuf ("ATAG","") | hrefclose -> return lexbuf ("ATAGEND","") | ident -> 
let rec level2_meta_token =
  lexer
  | utf8_blank+ -> level2_meta_token lexbuf
  | hreftag -> return lexbuf ("ATAG","")
  | hrefclose -> return lexbuf ("ATAGEND","")
  | ident ->
      handle_keywords lexbuf (fun x -> List.mem x level2_meta_keywords) "IDENT"
  | variable_ident -> return lexbuf ("IDENT", Ulexing.utf8_lexeme lexbuf)
  | pident ->
      handle_keywords lexbuf (fun x -> List.mem x level2_meta_keywords) "PIDENT"
  | "@{" -> read_unparsed_group "UNPARSED_AST" lexbuf
  | ast_ident ->
      return lexbuf ("UNPARSED_AST",
        remove_left_quote (Ulexing.utf8_lexeme lexbuf))
  | ast_csymbol ->
      return lexbuf ("UNPARSED_AST",
        remove_left_quote (Ulexing.utf8_lexeme lexbuf))
  | eof -> return_eoi lexbuf

let rec comment_token acc depth =
  lexer
  | beginnote ->
      let acc = acc ^ Ulexing.utf8_lexeme lexbuf in
      comment_token acc (depth + 1) lexbuf
  | endcomment ->
      let acc = acc ^ Ulexing.utf8_lexeme lexbuf in
      if depth = 0
      then acc
      else comment_token acc (depth - 1) lexbuf
  | _ ->
      let acc = acc ^ Ulexing.utf8_lexeme lexbuf in
      comment_token acc depth lexbuf

  (** @param k continuation to be invoked when no ligature has been found *)
let ligatures_token k =
  lexer
  | ligature ->
      let lexeme = Ulexing.utf8_lexeme lexbuf in
      (match List.rev (Hashtbl.find_all ligatures lexeme) with
      | [] -> (* ligature not found, rollback and try default lexer *)
          Ulexing.rollback lexbuf;
          k lexbuf
      | default_lig :: _ -> (* ligatures found, use the default one *)
          return_symbol lexbuf default_lig)
  | eof -> return_eoi lexbuf
  | _ ->  (* not a ligature, rollback and try default lexer *)
      Ulexing.rollback lexbuf;
      k lexbuf

module LocalizeOD =
  struct
    type t = Stdpp.location
    let compare = Pervasives.compare
  end

(* NOTE: so_pp and LocalizeEnv are reconstructed from their use in update_table
 * below: so_pp renders an optional string (the rendering of None is assumed)
 * and LocalizeEnv is a map keyed on locations. *)
let so_pp = function None -> "" | Some s -> s

module LocalizeEnv = Map.Make (LocalizeOD)

let update_table loc desc href loctable =
  if desc <> None || href <> None
   then
    (let s,e = HExtlib.loc_of_floc loc in
     prerr_endline (Printf.sprintf "*** [%d,%d] \"%s\",\"%s\""
       s e (so_pp href) (so_pp desc));
     LocalizeEnv.add loc (href,desc) loctable)
   else loctable
;;

let get_hot_spots =
  let rec aux loc1 desc href =
    lexer
    | hreftag -> aux_in_tag (Ulexing.loc lexbuf) None None lexbuf
    | hrefclose ->
        (try
          let loc1 = HExtlib.floc_of_loc (HExtlib.unopt loc1) in
          let loc2 = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
          (loc1,loc2,href,desc) :: aux None None None lexbuf
         with Failure _ -> aux None None None lexbuf)
    | beginnote ->
        (* FIXME: comments as in smallLexer *)
        let _comment = comment_token (Ulexing.utf8_lexeme lexbuf) 0 lexbuf in
(*      let comment =
          Ulexing.utf8_sub_lexeme lexbuf 2 (Ulexing.lexeme_length lexbuf - 4)
        in
        return lexbuf ("NOTE", comment) *)
        ligatures_token (aux loc1 desc href) lexbuf
    | begincomment -> return lexbuf ("BEGINCOMMENT","")
    | endcomment -> return lexbuf ("ENDCOMMENT","")
    | eof -> []
    | _ -> aux loc1 desc href lexbuf
  and aux_in_tag loc1 desc href =
    lexer
    | utf8_blank+ -> aux_in_tag loc1 desc href lexbuf
    | href ->
        aux_in_tag loc1 desc
          (Some (Ulexing.utf8_sub_lexeme lexbuf 6
            (Ulexing.lexeme_length lexbuf - 7)))
          lexbuf
    | hreftitle ->
        aux_in_tag loc1
          (Some (Ulexing.utf8_sub_lexeme lexbuf 7
            (Ulexing.lexeme_length lexbuf - 8)))
          href lexbuf
    | ">" ->
        let merge (a,b) (c,d) = (a,d) in
        aux (Some (merge loc1 (Ulexing.loc lexbuf))) desc href lexbuf
    | _ -> aux None None None lexbuf
  in
   aux None None None
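
(* Illustrative sketch, not part of the original file: mk_lexer (defined above)
 * wraps a token function such as level2_meta_token into a camlp5 Token.glexer,
 * which is the form a camlp5 grammar expects.  The binding name
 * _level2_meta_lexer is hypothetical. *)
let _level2_meta_lexer = mk_lexer level2_meta_token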