1 (* Copyright (C) 2005, HELM Team.
3 * This file is part of HELM, an Hypertextual, Electronic
4 * Library of Mathematics, developed at the Computer Science
5 * Department, University of Bologna, Italy.
7 * HELM is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * HELM is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with HELM; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
22 * For details, see the HELM World-Wide-Web page,
23 * http://helm.cs.unibo.it/
26 (* $Id: cicNotationLexer.ml 11028 2010-11-02 17:08:43Z tassi $ *)
30 exception Error of int * int * string
32 module StringSet = Set.Make(String)
37 let regexp number = xml_digit+
39 let regexp ident_letter = [ 'a' - 'z' 'A' - 'Z' ]
41 (* must be in sync with "is_ligature_char" below *)
42 let regexp ligature_char = [ "'`~!?@*()[]<>-+=|:;.,/\"" ]
43 let regexp ligature = ligature_char ligature_char+
45 let regexp ident_decoration = '\'' | '?' | '`'
46 let regexp ident_cont = ident_letter | xml_digit | '_'
47 let regexp ident_start = ident_letter
48 let regexp ident = ident_letter ident_cont* ident_decoration*
49 let regexp variable_ident = '_' '_' number
50 let regexp pident = '_' ident
52 let regexp tex_token = '\\' ident
54 let regexp csymbol = '\'' ident
56 let regexp begincom = "(*"
57 let regexp endcom = "*)"
58 let regexp qstring = '"' [^ '"']* '"'
63 (* (string, int) Hashtbl.t, with multiple bindings.
64 * int is the unicode codepoint *)
65 let ligatures = Hashtbl.create 23
69 (fun (ligature, symbol) -> Hashtbl.add ligatures ligature symbol)
70 [ ("->", <:unicode<to>>); ("=>", <:unicode<Rightarrow>>);
71 (":=", <:unicode<def>>);
74 let error lexbuf msg =
75 let begin_cnum, end_cnum = Ulexing.loc lexbuf in
76 raise (Error (begin_cnum, end_cnum, msg))
77 let error_at_end lexbuf msg =
78 let begin_cnum, end_cnum = Ulexing.loc lexbuf in
79 raise (Error (begin_cnum, end_cnum, msg))
83 let return_symbol lexbuf s = return lexbuf ("SYMBOL", s)
84 let return_eoi lexbuf = return lexbuf ("EOI", "")
87 let remove_quotes s = String.sub s 1 (String.length s - 2)
91 (* let lexbuf = Ulexing.from_utf8_stream stream in *)
92 (** XXX Obj.magic rationale.
94 * camlp5 constraints the tok_func field of Token.glexer to have type:
95 * Stream.t char -> (Stream.t 'te * flocation_function)
96 * In order to use ulex we have (in theory) to instantiate a new lexbuf each
97 * time a char Stream.t is passed, destroying the previous lexbuf which may
98 * have consumed a character from the old stream which is lost forever :-(
100 * Instead of passing to camlp5 a char Stream.t we pass a lexbuf, casting it to
101 * char Stream.t with Obj.magic where needed.
103 let lexbuf = Obj.magic stream in
104 Token.make_stream_and_location
109 | Ulexing.Error -> error_at_end lexbuf "Unexpected character"
110 | Ulexing.InvalidCodepoint p ->
111 error_at_end lexbuf (sprintf "Invalid code point: %d" p))
114 Token.tok_func = tok_func;
115 Token.tok_using = (fun _ -> ());
116 Token.tok_removing = (fun _ -> ());
117 Token.tok_match = (* Token.default_match *) (fun (a,b) () -> a);
118 Token.tok_text = Token.lexer_text;
119 Token.tok_comm = None;
122 let expand_macro lexbuf =
124 Ulexing.utf8_sub_lexeme lexbuf 1 (Ulexing.lexeme_length lexbuf - 1)
127 ("SYMBOL" ^ (Utf8Macro.expand macro))
128 with Utf8Macro.Macro_not_found _ ->
129 (* FG: unexpanded TeX macros are terminated by a space for rendering *)
130 "SYMBOL" ^ (Ulexing.utf8_lexeme lexbuf ^ " ")
132 let remove_quotes s = String.sub s 1 (String.length s - 2)
133 let remove_left_quote s = String.sub s 1 (String.length s - 1)
135 let print s t = s := (!s ^ t) (* Printf.fprintf *)
140 let handle_full_id ch id uri =
141 let id' = Printf.sprintf "\005A href=\"%s\"\006%s\005/A\006" uri id
145 let handle_full_sym ch s uri desc =
146 let uri = try HExtlib.unopt uri
147 with Failure _ -> "cic:/fakeuri.def(1)" in
148 let s' = Printf.sprintf "\005A title=\"%s\" href=\"%s\"\006%s\005/A\006" desc uri s
152 let handle_full_num ch num uri desc =
153 let uri = try HExtlib.unopt uri
154 with Failure _ -> "cic:/fakeuri.def(1)"
156 let num' = Printf.sprintf "\005A title=\"%s\" href=\"%s\"\006%s\005/A\006" desc uri num
160 let handle_tex ch s =
161 (* print "TEX:%s" (remove_left_quote (Ulexing.utf8_lexeme lexbuf)) *)
164 let return_with_loc token begin_cnum end_cnum =
165 let flocation = HExtlib.floc_of_loc (begin_cnum,end_cnum) in
168 let return lexbuf token =
169 let begin_cnum, end_cnum = Ulexing.loc lexbuf in
170 return_with_loc token begin_cnum end_cnum
172 (** @param k continuation to be invoked when no ligature has been found *)
173 let ligatures_token k =
176 let lexeme = Ulexing.utf8_lexeme lexbuf in
177 (match List.rev (Hashtbl.find_all ligatures lexeme) with
178 | [] -> (* ligature not found, rollback and try default lexer *)
179 Ulexing.rollback lexbuf;
181 | default_lig :: _ -> (* ligatures found, use the default one *)
182 return_symbol lexbuf default_lig) *)
183 | eof -> return lexbuf ()
184 | _ -> (* not a ligature, rollback and try default lexer *)
185 Ulexing.rollback lexbuf;
188 let handle_interpr loc literal interp outstr =
190 (match DisambiguateTypes.InterprEnv.find loc interp with
191 | GrafiteAst.Ident_alias (_,uri) ->
192 handle_full_id outstr literal uri
193 | GrafiteAst.Symbol_alias (_,uri,desc) ->
194 handle_full_sym outstr literal uri desc
195 | GrafiteAst.Number_alias (uri,desc) ->
196 handle_full_num outstr literal uri desc)
197 with Not_found -> print outstr literal
200 let level2_ast_token interp outstr =
201 let rec aux nesting =
203 | begincom -> (print outstr (Ulexing.utf8_lexeme lexbuf);
204 aux (nesting+1) lexbuf)
205 | endcom -> (print outstr (Ulexing.utf8_lexeme lexbuf);
206 aux (max 0 (nesting-1)) lexbuf)
207 | eof -> return lexbuf ()
208 | _ -> if nesting > 0
209 then (print outstr (Ulexing.utf8_lexeme lexbuf);
211 else (Ulexing.rollback lexbuf;
215 | ident -> let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
216 let id = Ulexing.utf8_lexeme lexbuf in
217 handle_interpr loc id interp outstr;
220 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
221 let id = Ulexing.utf8_lexeme lexbuf in
222 handle_interpr loc id interp outstr;
224 | pident -> let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
225 let id = Ulexing.utf8_lexeme lexbuf in
226 handle_interpr loc id interp outstr;
229 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
230 let num = Ulexing.utf8_lexeme lexbuf in
231 handle_interpr loc num interp outstr;
234 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
235 let s = Ulexing.utf8_lexeme lexbuf in
236 handle_interpr loc s interp outstr;
238 | qstring -> (print outstr (Ulexing.utf8_lexeme lexbuf); aux 0 lexbuf)
240 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
241 let s = Ulexing.utf8_lexeme lexbuf in
242 handle_interpr loc s interp outstr;
244 | eof -> return lexbuf ()
246 let loc = HExtlib.floc_of_loc (Ulexing.loc lexbuf) in
247 let s = Ulexing.utf8_lexeme lexbuf in
248 handle_interpr loc s interp outstr;
253 (* API implementation *)
255 (*let mk_small_lexer interpr keywords =
256 let initial_keywords =
257 [ "CProp"; "Prop"; "Type"; "Set"; "let"; "match";
258 "with"; "in"; "and"; "to"; "as"; "on"; "return"; "done";
262 List.fold_right StringSet.add (initial_keywords @ keywords) StringSet.empty
264 mk_lexer (level2_ast_token outstr interpr)
267 let mk_small_printer interpr outstr =
268 let initial_keywords =
269 [ "CProp"; "Prop"; "Type"; "Set"; "let"; "match";
270 "with"; "in"; "and"; "to"; "as"; "on"; "return"; "done";
274 List.fold_right StringSet.add initial_keywords StringSet.empty
276 level2_ast_token interpr outstr
279 (* let mk_small_lexer interpr = mk_small_lexer interpr [];;*)