2 * ----------------------------------------------------------------------
8 let comment_re = Str.regexp "#.*$";;
9 let space_re = Str.regexp "[ \t\r\n]+";;
11 let read_unimap_format_a fname f =
12 (* Reads a Unicode mapping in format A from a "local" code to Unicode.
13 * Returns a list of pairs (localcode, unicode).
16 let read_unimap_line() =
17 let s = input_line f in (* may raise End_of_file *)
18 let s' = Str.global_replace comment_re "" s in
19 let words = Str.split space_re s' in
22 | [ localcode; unicode ] ->
23 int_of_string localcode, int_of_string unicode
25 failwith ("File " ^ fname ^ ": Do not know what to do with:\n" ^ s')
28 let rec read_following_lines() =
30 let localcode, unicode = read_unimap_line() in
31 (* may raise End_of_file, Not_found *)
32 (localcode, unicode) :: read_following_lines()
34 Not_found -> read_following_lines()
38 read_following_lines()
44 | U_single of (int * int)
45 | U_list of (int * int) list
49 (* A hashtable with fixed size (256). A pair (unicode, localcode) is
50 * stored at the position unicode mod 256 in the array.
54 let make_bijection unimap =
55 (* unimap: a list of pairs (localcode, unicode)
56 * returns a pair of arrays (m_to_unicode, m_from_unicode) with:
57 * - m_to_unicode.(localcode) = Some unicode,
58 * if the pair (localcode, unicode) exists
59 * m_to_unicode.(x) = None otherwise
60 * - m_from_unicode.(unicode lsr 8) = [ ...; (unicode,localcode); ... ]
63 let m_to_unicode = Array.create 256 None in
64 let m_from_unicode = Array.create 256 [] in
67 (fun (localcode, unicode) ->
68 assert(localcode < 256);
70 (* Update m_to_unicode: *)
71 if m_to_unicode.(localcode) <> None then
72 failwith ("Local code point " ^ string_of_int localcode ^
74 m_to_unicode.(localcode) <- Some unicode;
76 (* Update m_from_unicode: *)
77 let unilow = unicode land 255 in
78 if List.mem_assoc unicode (m_from_unicode.(unilow)) then
79 failwith ("Unicode code point " ^ string_of_int unicode ^
81 m_from_unicode.(unilow) <-
82 m_from_unicode.(unilow) @ [unicode,localcode];
86 m_to_unicode, m_from_unicode
90 let to_unimap_as_string to_unimap =
96 Marshal.to_string (Array.map make_repr to_unimap) [ Marshal.No_sharing ]
100 let from_unimap_as_string from_unimap =
104 | [u,l] -> U_single(u,l)
107 let m = Array.map make_repr from_unimap in
108 Marshal.to_string m [ Marshal.No_sharing ]
112 let print_bijection f name m_to_unicode m_from_unicode =
113 (* Prints on file f this O'Caml code:
114 * let <name>_to_unicode = ...
115 * let <name>_from_unicode = ...
117 fprintf f "let %s_to_unicode = lazy (Marshal.from_string \"%s\" 0 : int array);;\n"
119 (String.escaped (to_unimap_as_string m_to_unicode));
121 fprintf f "let %s_from_unicode = lazy (Marshal.from_string \"%s\" 0 : Netmappings.from_uni_list array);;\n "
123 (String.escaped (from_unimap_as_string m_from_unicode));
128 let files = ref [] in
129 let outch = ref (lazy stdout) in
131 [ "-o", Arg.String (fun s -> outch := lazy (open_out s)),
132 " <file> Write result to this file"]
133 (fun s -> files := !files @ [s])
134 "usage: unimap_to_ocaml file.unimap ...";
136 (* First read in all unimaps: *)
140 let mapname = Str.replace_first (Str.regexp "\.unimap$") ""
141 (Filename.basename filename) in
142 let f = open_in filename in
143 prerr_endline ("Reading " ^ filename);
144 let unimap = read_unimap_format_a filename f in
151 (* Second compute all bijections: *)
154 (fun (mapname, unimap) ->
155 prerr_endline ("Processing " ^ mapname);
156 let to_unicode, from_unicode = make_bijection unimap in
157 mapname, to_unicode, from_unicode
162 let out = Lazy.force !outch in
163 (* Third output all results: *)
164 output_string out "(* WARNING! This is a generated file! *)\n";
167 (fun (mapname, to_unicode, from_unicode) ->
168 print_bijection out mapname to_unicode from_unicode)
171 (fun (mapname, _, _) ->
172 fprintf out "Hashtbl.add Netmappings.to_unicode `Enc_%s %s_to_unicode;\n"
174 fprintf out "Hashtbl.add Netmappings.from_unicode `Enc_%s %s_from_unicode;\n"
177 (List.rev bijections);
178 fprintf out "();;\n";
186 (* ======================================================================
190 * Revision 1.1 2000/11/17 09:57:29 lpadovan
193 * Revision 1.3 2000/08/29 00:48:52 gerd
194 * Conversion tables are now stored in marshalled form.
195 * New type for the conversion table Unicode to 8bit.
197 * Revision 1.2 2000/08/12 23:54:56 gerd