X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2Focaml%2Fcic_notation%2FcicNotationLexer.ml;fp=helm%2Focaml%2Fcic_notation%2FcicNotationLexer.ml;h=0000000000000000000000000000000000000000;hb=9a0e4f3be9f70662f18d2d3b6dd60ae79fba565b;hp=33fb8fd78494085a819c9d65fb11e2020a654c42;hpb=f59550b5a9cdddbb348697201fae7d736d6b96c5;p=helm.git diff --git a/helm/ocaml/cic_notation/cicNotationLexer.ml b/helm/ocaml/cic_notation/cicNotationLexer.ml deleted file mode 100644 index 33fb8fd78..000000000 --- a/helm/ocaml/cic_notation/cicNotationLexer.ml +++ /dev/null @@ -1,351 +0,0 @@ -(* Copyright (C) 2005, HELM Team. - * - * This file is part of HELM, an Hypertextual, Electronic - * Library of Mathematics, developed at the Computer Science - * Department, University of Bologna, Italy. - * - * HELM is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * HELM is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with HELM; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, - * MA 02111-1307, USA. - * - * For details, see the HELM World-Wide-Web page, - * http://helm.cs.unibo.it/ - *) - -open Printf - -exception Error of int * int * string - -let regexp number = xml_digit+ - - (* ZACK: breaks unicode's binder followed by an ascii letter without blank *) -(* let regexp ident_letter = xml_letter *) - -let regexp ident_letter = [ 'a' - 'z' 'A' - 'Z' ] - - (* must be in sync with "is_ligature_char" below *) -let regexp ligature_char = [ "'`~!?@*()[]<>-+=|:;.,/\"" ] -let regexp ligature = ligature_char ligature_char+ - -let is_ligature_char = - (* must be in sync with "regexp ligature_char" above *) - let chars = "'`~!?@*()[]<>-+=|:;.,/\"" in - (fun char -> - (try - ignore (String.index chars char); - true - with Not_found -> false)) - -let regexp ident_decoration = '\'' | '?' | '`' -let regexp ident_cont = ident_letter | xml_digit | '_' -let regexp ident = ident_letter ident_cont* ident_decoration* - -let regexp tex_token = '\\' ident - -let regexp delim_begin = "\\[" -let regexp delim_end = "\\]" - -let regexp qkeyword = "'" ident "'" - -let regexp implicit = '?' -let regexp placeholder = '%' -let regexp meta = implicit number - -let regexp csymbol = '\'' ident - -let regexp begin_group = "@{" | "${" -let regexp end_group = '}' -let regexp wildcard = "$_" -let regexp ast_ident = "@" ident -let regexp ast_csymbol = "@" csymbol -let regexp meta_ident = "$" ident -let regexp meta_anonymous = "$_" -let regexp qstring = '"' [^ '"']* '"' - -let regexp begincomment = "(**" xml_blank -let regexp beginnote = "(*" -let regexp endcomment = "*)" -(* let regexp comment_char = [^'*'] | '*'[^')'] -let regexp note = "|+" ([^'*'] | "**") comment_char* "+|" *) - -let level1_layouts = - [ "sub"; "sup"; - "below"; "above"; - "over"; "atop"; "frac"; - "sqrt"; "root" - ] - -let level1_keywords = - [ "hbox"; "hvbox"; "hovbox"; "vbox"; - "break"; - "list0"; "list1"; "sep"; - "opt"; - "term"; "ident"; "number" - ] @ level1_layouts - -let level2_meta_keywords = - [ "if"; "then"; "else"; - "fold"; "left"; "right"; "rec"; - "fail"; - "default"; - "anonymous"; "ident"; "number"; "term"; "fresh" - ] - - (* (string, unit) Hashtbl.t, to exploit multiple bindings *) -let level2_ast_keywords = Hashtbl.create 23 -let _ = - List.iter (fun k -> Hashtbl.add level2_ast_keywords k ()) - [ "CProp"; "Prop"; "Type"; "Set"; "let"; "rec"; "corec"; "match"; - "with"; "in"; "and"; "to"; "as"; "on"; "return" ] - -let add_level2_ast_keyword k = Hashtbl.add level2_ast_keywords k () -let remove_level2_ast_keyword k = Hashtbl.remove level2_ast_keywords k - - (* (string, int) Hashtbl.t, with multiple bindings. - * int is the unicode codepoint *) -let ligatures = Hashtbl.create 23 -let _ = - List.iter - (fun (ligature, symbol) -> Hashtbl.add ligatures ligature symbol) - [ ("->", <:unicode>); ("=>", <:unicode>); - ("<=", <:unicode>); (">=", <:unicode>); - ("<>", <:unicode>); (":=", <:unicode>); - ] - -let regexp uri_step = [ 'a' - 'z' 'A' - 'Z' '0' - '9' '_' '-' ]+ - -let regexp uri = - ("cic:/" | "theory:/") (* schema *) -(* ident ('/' ident)* |+ path +| *) - uri_step ('/' uri_step)* (* path *) - ('.' ident)+ (* ext *) - ("#xpointer(" number ('/' number)+ ")")? (* xpointer *) - -let error lexbuf msg = - let begin_cnum, end_cnum = Ulexing.loc lexbuf in - raise (Error (begin_cnum, end_cnum, msg)) -let error_at_end lexbuf msg = - let begin_cnum, end_cnum = Ulexing.loc lexbuf in - raise (Error (begin_cnum, end_cnum, msg)) - -let return_with_loc token begin_cnum end_cnum = - (* TODO handle line/column numbers *) - let flocation_begin = - { Lexing.pos_fname = ""; - Lexing.pos_lnum = -1; Lexing.pos_bol = -1; - Lexing.pos_cnum = begin_cnum } - in - let flocation_end = { flocation_begin with Lexing.pos_cnum = end_cnum } in - (token, (flocation_begin, flocation_end)) - -let return lexbuf token = - let begin_cnum, end_cnum = Ulexing.loc lexbuf in - return_with_loc token begin_cnum end_cnum - -let return_lexeme lexbuf name = return lexbuf (name, Ulexing.utf8_lexeme lexbuf) - -let return_symbol lexbuf s = return lexbuf ("SYMBOL", s) -let return_eoi lexbuf = return lexbuf ("EOI", "") - -let remove_quotes s = String.sub s 1 (String.length s - 2) - -let mk_lexer token = - let tok_func stream = -(* let lexbuf = Ulexing.from_utf8_stream stream in *) -(** XXX Obj.magic rationale. - * The problem. - * camlp4 constraints the tok_func field of Token.glexer to have type: - * Stream.t char -> (Stream.t 'te * flocation_function) - * In order to use ulex we have (in theory) to instantiate a new lexbuf each - * time a char Stream.t is passed, destroying the previous lexbuf which may - * have consumed a character from the old stream which is lost forever :-( - * The "solution". - * Instead of passing to camlp4 a char Stream.t we pass a lexbuf, casting it to - * char Stream.t with Obj.magic where needed. - *) - let lexbuf = Obj.magic stream in - Token.make_stream_and_flocation - (fun () -> - try - token lexbuf - with - | Ulexing.Error -> error_at_end lexbuf "Unexpected character" - | Ulexing.InvalidCodepoint p -> - error_at_end lexbuf (sprintf "Invalid code point: %d" p)) - in - { - Token.tok_func = tok_func; - Token.tok_using = (fun _ -> ()); - Token.tok_removing = (fun _ -> ()); - Token.tok_match = Token.default_match; - Token.tok_text = Token.lexer_text; - Token.tok_comm = None; - } - -let expand_macro lexbuf = - let macro = - Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1) - in - try - ("SYMBOL", Utf8Macro.expand macro) - with Utf8Macro.Macro_not_found _ -> "SYMBOL", Ulexing.utf8_lexeme lexbuf - -let remove_quotes s = String.sub s 1 (String.length s - 2) -let remove_left_quote s = String.sub s 1 (String.length s - 1) - -let rec level2_pattern_token_group counter buffer = - lexer - | end_group -> - if (counter > 0) then - Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; - snd (Ulexing.loc lexbuf) - | begin_group -> - Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; - ignore (level2_pattern_token_group (counter + 1) buffer lexbuf) ; - level2_pattern_token_group counter buffer lexbuf - | _ -> - Buffer.add_string buffer (Ulexing.utf8_lexeme lexbuf) ; - level2_pattern_token_group counter buffer lexbuf - -let read_unparsed_group token_name lexbuf = - let buffer = Buffer.create 16 in - let begin_cnum, _ = Ulexing.loc lexbuf in - let end_cnum = level2_pattern_token_group 0 buffer lexbuf in - return_with_loc (token_name, Buffer.contents buffer) begin_cnum end_cnum - -let rec level2_meta_token = - lexer - | xml_blank+ -> level2_meta_token lexbuf - | ident -> - let s = Ulexing.utf8_lexeme lexbuf in - begin - if List.mem s level2_meta_keywords then - return lexbuf ("", s) - else - return lexbuf ("IDENT", s) - end - | "@{" -> read_unparsed_group "UNPARSED_AST" lexbuf - | ast_ident -> - return lexbuf ("UNPARSED_AST", - remove_left_quote (Ulexing.utf8_lexeme lexbuf)) - | ast_csymbol -> - return lexbuf ("UNPARSED_AST", - remove_left_quote (Ulexing.utf8_lexeme lexbuf)) - | eof -> return_eoi lexbuf - -let rec comment_token acc depth = - lexer - | beginnote -> - let acc = acc ^ Ulexing.utf8_lexeme lexbuf in - comment_token acc (depth + 1) lexbuf - | endcomment -> - let acc = acc ^ Ulexing.utf8_lexeme lexbuf in - if depth = 0 - then acc - else comment_token acc (depth - 1) lexbuf - | _ -> - let acc = acc ^ Ulexing.utf8_lexeme lexbuf in - comment_token acc depth lexbuf - - (** @param k continuation to be invoked when no ligature has been found *) -let rec ligatures_token k = - lexer - | ligature -> - let lexeme = Ulexing.utf8_lexeme lexbuf in - (match List.rev (Hashtbl.find_all ligatures lexeme) with - | [] -> (* ligature not found, rollback and try default lexer *) - Ulexing.rollback lexbuf; - k lexbuf - | default_lig :: _ -> (* ligatures found, use the default one *) - return_symbol lexbuf default_lig) - | eof -> return_eoi lexbuf - | _ -> (* not a ligature, rollback and try default lexer *) - Ulexing.rollback lexbuf; - k lexbuf - -and level2_ast_token = - lexer - | xml_blank+ -> ligatures_token level2_ast_token lexbuf - | meta -> return lexbuf ("META", Ulexing.utf8_lexeme lexbuf) - | implicit -> return lexbuf ("IMPLICIT", "") - | placeholder -> return lexbuf ("PLACEHOLDER", "") - | ident -> - let lexeme = Ulexing.utf8_lexeme lexbuf in - if Hashtbl.mem level2_ast_keywords lexeme then - return lexbuf ("", lexeme) - else - return lexbuf ("IDENT", lexeme) - | number -> return lexbuf ("NUMBER", Ulexing.utf8_lexeme lexbuf) - | tex_token -> return lexbuf (expand_macro lexbuf) - | uri -> return lexbuf ("URI", Ulexing.utf8_lexeme lexbuf) - | qstring -> - return lexbuf ("QSTRING", remove_quotes (Ulexing.utf8_lexeme lexbuf)) - | csymbol -> - return lexbuf ("CSYMBOL", remove_left_quote (Ulexing.utf8_lexeme lexbuf)) - | "${" -> read_unparsed_group "UNPARSED_META" lexbuf - | "@{" -> read_unparsed_group "UNPARSED_AST" lexbuf - | '(' -> return lexbuf ("LPAREN", "") - | ')' -> return lexbuf ("RPAREN", "") - | meta_ident -> - return lexbuf ("UNPARSED_META", - remove_left_quote (Ulexing.utf8_lexeme lexbuf)) - | meta_anonymous -> return lexbuf ("UNPARSED_META", "anonymous") - | beginnote -> - let comment = comment_token (Ulexing.utf8_lexeme lexbuf) 0 lexbuf in -(* let comment = - Ulexing.utf8_sub_lexeme lexbuf 2 (Ulexing.lexeme_length lexbuf - 4) - in - return lexbuf ("NOTE", comment) *) - ligatures_token level2_ast_token lexbuf - | begincomment -> return lexbuf ("BEGINCOMMENT","") - | endcomment -> return lexbuf ("ENDCOMMENT","") - | eof -> return_eoi lexbuf - | _ -> return_symbol lexbuf (Ulexing.utf8_lexeme lexbuf) - -and level1_pattern_token = - lexer - | xml_blank+ -> ligatures_token level1_pattern_token lexbuf - | number -> return lexbuf ("NUMBER", Ulexing.utf8_lexeme lexbuf) - | ident -> - let s = Ulexing.utf8_lexeme lexbuf in - begin - if List.mem s level1_keywords then - return lexbuf ("", s) - else - return lexbuf ("IDENT", s) - end - | tex_token -> return lexbuf (expand_macro lexbuf) - | qkeyword -> - return lexbuf ("QKEYWORD", remove_quotes (Ulexing.utf8_lexeme lexbuf)) - | '(' -> return lexbuf ("LPAREN", "") - | ')' -> return lexbuf ("RPAREN", "") - | eof -> return_eoi lexbuf - | _ -> return_symbol lexbuf (Ulexing.utf8_lexeme lexbuf) - -let level1_pattern_token = ligatures_token level1_pattern_token -let level2_ast_token = ligatures_token level2_ast_token - -(* API implementation *) - -let level1_pattern_lexer = mk_lexer level1_pattern_token -let level2_ast_lexer = mk_lexer level2_ast_token -let level2_meta_lexer = mk_lexer level2_meta_token - -let lookup_ligatures lexeme = - try - if lexeme.[0] = '\\' - then [ Utf8Macro.expand (String.sub lexeme 1 (String.length lexeme - 1)) ] - else List.rev (Hashtbl.find_all ligatures lexeme) - with Invalid_argument _ | Utf8Macro.Macro_not_found _ as exn -> [] -