(* Copyright (C) 2005, HELM Team. * * This file is part of HELM, an Hypertextual, Electronic * Library of Mathematics, developed at the Computer Science * Department, University of Bologna, Italy. * * HELM is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * HELM is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HELM; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, * MA 02111-1307, USA. * * For details, see the HELM World-Wide-Web page, * http://helm.cs.unibo.it/ *) (* $Id: cicNotationLexer.ml 11231 2011-03-30 11:52:27Z ricciott $ *) open Printf exception Error of int * int * string module StringSet = Set.Make(String) (* Lexer *) let regexp number = xml_digit+ let regexp utf8_blank = " " | "\r\n" | "\n" | "\t" | [160] (* this is a nbsp *) let regexp percentage = ('-' | "") [ '0' - '9' ] + '%' let regexp floatwithunit = ('-' | "") [ '0' - '9' ] + ["."] [ '0' - '9' ] + ([ 'a' - 'z' ] + | "" ) let regexp color = "#" [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ] [ '0' - '9' 'a' - 'f' 'A' - 'F' ] (* ZACK: breaks unicode's binder followed by an ascii letter without blank *) (* let regexp ident_letter = xml_letter *) let regexp ident_letter = [ 'a' - 'z' 'A' - 'Z' ] (* must be in sync with "is_ligature_char" below *) let regexp ligature_char = [ "'`~!?@*()[]<>-+=|:;.,/\"" ] let regexp ligature = ligature_char ligature_char+ let regexp we_proved = "we" utf8_blank+ "proved" let regexp we_have = "we" utf8_blank+ "have" let regexp let_rec = "let" utf8_blank+ "rec" let regexp let_corec = "let" utf8_blank+ "corec" let regexp ident_decoration = '\'' | '?' | '`' let regexp ident_cont = ident_letter | xml_digit | '_' let regexp ident_start = ident_letter let regexp ident = ident_letter ident_cont* ident_decoration* let regexp variable_ident = '_' '_' number let regexp pident = '_' ident let regexp uri_step = [ 'a' - 'z' 'A' - 'Z' '0' - '9' '_' '-' ''' ]+ let regexp uri = ("cic:/" | "theory:/") (* schema *) (* ident ('/' ident)* |+ path +| *) uri_step ('/' uri_step)* (* path *) ('.' ident)+ (* ext *) (* ("#xpointer(" number ('/' number)+ ")")? (* xpointer *) *) ("(" number (',' number)* ")")? (* reference spec *) let regexp qstring = '"' [^ '"']* '"' let regexp hreftag = "<" [ 'A' 'a' ] let regexp href = "href=\"" uri "\"" let regexp hreftitle = "title=" qstring let regexp hrefclose = "" let regexp tex_token = '\\' ident let regexp delim_begin = "\\[" let regexp delim_end = "\\]" let regexp qkeyword = "'" ( ident | pident ) "'" let regexp implicit = '?' let regexp placeholder = '%' let regexp meta = implicit number let regexp csymbol = '\'' ident let regexp begin_group = "@{" | "${" let regexp end_group = '}' let regexp wildcard = "$_" let regexp ast_ident = "@" ident let regexp ast_csymbol = "@" csymbol let regexp meta_ident = "$" ident let regexp meta_anonymous = "$_" let regexp begincom = "(*" let regexp endcom = "*)" (* let regexp comment_char = [^'*'] | '*'[^')'] let regexp note = "|+" ([^'*'] | "**") comment_char* "+|" *) let level1_layouts = [ "sub"; "sup"; "below"; "above"; "over"; "atop"; "frac"; "sqrt"; "root"; "mstyle" ; "mpadded"; "maction" ] let level1_keywords = [ "hbox"; "hvbox"; "hovbox"; "vbox"; "break"; "list0"; "list1"; "sep"; "opt"; "term"; "ident"; "number"; ] @ level1_layouts let level2_meta_keywords = [ "if"; "then"; "elCicNotationParser.se"; "fold"; "left"; "right"; "rec"; "fail"; "default"; "anonymous"; "ident"; "number"; "term"; "fresh" ] (* (string, int) Hashtbl.t, with multiple bindings. * int is the unicode codepoint *) let ligatures = Hashtbl.create 23 let _ = List.iter (fun (ligature, symbol) -> Hashtbl.add ligatures ligature symbol) [ ("->", <:unicode>); ("=>", <:unicode>); (":=", <:unicode>); ] let regexp nreference = "cic:/" (* schema *) uri_step ('/' uri_step)* (* path *) '.' ( "dec" | "def" "(" number ")" | "fix" "(" number "," number "," number ")" | "cfx" "(" number ")" | "ind" "(" number "," number "," number ")" | "con" "(" number "," number "," number ")") (* ext + reference *) let error lexbuf msg = let begin_cnum, end_cnum = Ulexing.loc lexbuf in raise (Error (begin_cnum, end_cnum, msg)) let error_at_end lexbuf msg = let begin_cnum, end_cnum = Ulexing.loc lexbuf in raise (Error (begin_cnum, end_cnum, msg)) let loc_of_buf lexbuf = HExtlib.floc_of_loc (Ulexing.loc lexbuf) let return_with_loc token begin_cnum end_cnum = let flocation = HExtlib.floc_of_loc (begin_cnum,end_cnum) in token, flocation let return lexbuf token = let begin_cnum, end_cnum = Ulexing.loc lexbuf in return_with_loc token begin_cnum end_cnum let return_lexeme lexbuf name = return lexbuf (name, Ulexing.utf8_lexeme lexbuf) let return_symbol lexbuf s = return lexbuf ("SYMBOL", s) let return_eoi lexbuf = return lexbuf ("EOI", "") let remove_quotes s = String.sub s 1 (String.length s - 2) let mk_lexer token = let tok_func stream = (* let lexbuf = Ulexing.from_utf8_stream stream in *) (** XXX Obj.magic rationale. * The problem. * camlp5 constraints the tok_func field of Token.glexer to have type: * Stream.t char -> (Stream.t 'te * flocation_function) * In order to use ulex we have (in theory) to instantiate a new lexbuf each * time a char Stream.t is passed, destroying the previous lexbuf which may * have consumed a character from the old stream which is lost forever :-( * The "solution". * Instead of passing to camlp5 a char Stream.t we pass a lexbuf, casting it to * char Stream.t with Obj.magic where needed. *) let lexbuf = Obj.magic stream in Token.make_stream_and_location (fun () -> try token lexbuf with | Ulexing.Error -> error_at_end lexbuf "Unexpected character" | Ulexing.InvalidCodepoint p -> error_at_end lexbuf (sprintf "Invalid code point: %d" p)) in { Token.tok_func = tok_func; Token.tok_using = (fun _ -> ()); Token.tok_removing = (fun _ -> ()); Token.tok_match = Token.default_match; Token.tok_text = Token.lexer_text; Token.tok_comm = None; } let expand_macro lexbuf = let macro = Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1) in try ("SYMBOL", Utf8Macro.expand macro) with Utf8Macro.Macro_not_found _ -> (* FG: unexpanded TeX macros are terminated by a space for rendering *) "SYMBOL", (Ulexing.utf8_lexeme lexbuf ^ " ") let remove_quotes s = String.sub s 1 (String.length s - 2) let remove_left_quote s = String.sub s 1 (String.length s - 1) let rec level2_pattern_token_group counter buffer = lexer | end_group -> if (counter > 0) then Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; snd (Ulexing.loc lexbuf) | begin_group -> Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; ignore (level2_pattern_token_group (counter + 1) buffer lexbuf) ; level2_pattern_token_group counter buffer lexbuf | _ -> Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; level2_pattern_token_group counter buffer lexbuf let read_unparsed_group token_name lexbuf = let buffer = Buffer.create 16 in let begin_cnum, _ = Ulexing.loc lexbuf in let end_cnum = level2_pattern_token_group 0 buffer lexbuf in return_with_loc (token_name, Buffer.contents buffer) begin_cnum end_cnum let handle_keywords lexbuf k name = let s = Ulexing.utf8_lexeme lexbuf in if k s then return lexbuf ("", s) else return lexbuf (name, s) ;; let rec level2_meta_token = lexer | utf8_blank+ -> level2_meta_token lexbuf | hreftag -> return lexbuf ("ATAG","") | hrefclose -> return lexbuf ("ATAGEND","") | ident -> handle_keywords lexbuf (fun x -> List.mem x level2_meta_keywords) "IDENT" | variable_ident -> return lexbuf ("IDENT", Ulexing.utf8_lexeme lexbuf) | pident -> handle_keywords lexbuf (fun x -> List.mem x level2_meta_keywords) "PIDENT" | "@{" -> read_unparsed_group "UNPARSED_AST" lexbuf | ast_ident -> return lexbuf ("UNPARSED_AST", remove_left_quote (Ulexing.utf8_lexeme lexbuf)) | ast_csymbol -> return lexbuf ("UNPARSED_AST", remove_left_quote (Ulexing.utf8_lexeme lexbuf)) | eof -> return_eoi lexbuf (** @param k continuation to be invoked when no ligature has been found *) let ligatures_token k = lexer | ligature -> let lexeme = Ulexing.utf8_lexeme lexbuf in (match List.rev (Hashtbl.find_all ligatures lexeme) with | [] -> (* ligature not found, rollback and try default lexer *) Ulexing.rollback lexbuf; k lexbuf | default_lig :: _ -> (* ligatures found, use the default one *) return_symbol lexbuf default_lig) | eof -> return_eoi lexbuf | _ -> (* not a ligature, rollback and try default lexer *) Ulexing.rollback lexbuf; k lexbuf let so_pp = function | None -> "()" | Some s -> s ;; (* let update_table loc desc href loctable = if desc <> None || href <> None then (let s,e = HExtlib.loc_of_floc loc in prerr_endline (Printf.sprintf "*** [%d,%d] \"%s\",\"%s\"" s e (so_pp href) (so_pp desc)); CicNotationLexer.LocalizeEnv.add loc (href,desc) loctable) else loctable ;; *) let get_hot_spots = (* k = lexing continuation *) let rec aux nesting k = lexer | begincom -> aux (nesting+1) k lexbuf | endcom -> aux (max 0 (nesting-1)) k lexbuf | eof -> [] | _ -> if nesting > 0 then aux nesting k lexbuf else (Ulexing.rollback lexbuf; k lexbuf) and aux_basic loc1 desc href = lexer | hreftag -> aux 0 (aux_in_tag (Ulexing.loc lexbuf) None None) lexbuf | hrefclose -> (try let loc1 = HExtlib.floc_of_loc (HExtlib.unopt loc1) in let loc2 = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in (loc1,loc2,href,desc) :: aux 0 (aux_basic None None None) lexbuf with Failure _ -> aux 0 (aux_basic None None None) lexbuf) | eof -> [] | _ -> aux 0 (aux_basic loc1 desc href) lexbuf and aux_in_tag loc1 desc href = lexer | utf8_blank+ -> aux 0 (aux_in_tag loc1 desc href) lexbuf | href -> aux 0 (aux_in_tag loc1 desc (Some (Ulexing.utf8_sub_lexeme lexbuf 6 (Ulexing.lexeme_length lexbuf - 7)))) lexbuf | hreftitle -> aux 0 (aux_in_tag loc1 (Some (Ulexing.utf8_sub_lexeme lexbuf 7 (Ulexing.lexeme_length lexbuf - 8))) href) lexbuf | ">" -> let merge (a,b) (c,d) = (a,d) in aux 0 (aux_basic (Some (merge loc1 (Ulexing.loc lexbuf))) desc href) lexbuf | _ -> aux 0 (aux_basic None None None) lexbuf in aux 0 (aux_basic None None None) let get_hot_spots s = get_hot_spots (Ulexing.from_utf8_string s) (*let xmarkup = "\005" let ymarkup = "\006" let regexp tag_cont = ident_letter | xml_digit | "_" | "-" let regexp tagname = ident_letter tag_cont* let regexp opentag = xmarkup tagname let regexp closetag = xmarkup "/" tagname ymarkup let regexp attribute = tagname "=" qstring let rec to_xy = lexer | closetag -> xmarkup ^ ymarkup ^ xmarkup ^ to_xy lexbuf | opentag -> let tag = Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1) in xmarkup ^ ymarkup ^ tag ^ to_xy_inner lexbuf | eof -> "" | _ -> let lexeme = Ulexing.utf8_lexeme lexbuf in prerr_endline ("matched " ^ lexeme); lexeme ^ to_xy lexbuf and to_xy_inner = lexer | utf8_blank+ -> to_xy_inner lexbuf | attribute -> let lexeme = Ulexing.utf8_lexeme lexbuf in ymarkup ^ lexeme ^ to_xy_inner lexbuf | ">" -> xmarkup ^ to_xy lexbuf | eof -> "" | _ -> to_xy lexbuf ;; let to_xy s = to_xy (Ulexing.from_utf8_string s)*)