(* $Id$ * ---------------------------------------------------------------------- * *) open Printf;; let comment_re = Str.regexp "#.*$";; let space_re = Str.regexp "[ \t\r\n]+";; let read_unimap_format_a fname f = (* Reads a Unicode mapping in format A from a "local" code to Unicode. * Returns a list of pairs (localcode, unicode). *) let read_unimap_line() = let s = input_line f in (* may raise End_of_file *) let s' = Str.global_replace comment_re "" s in let words = Str.split space_re s' in match words with [] -> raise Not_found | [ localcode; unicode ] -> int_of_string localcode, int_of_string unicode | _ -> failwith ("File " ^ fname ^ ": Do not know what to do with:\n" ^ s') in let rec read_following_lines() = try let localcode, unicode = read_unimap_line() in (* may raise End_of_file, Not_found *) (localcode, unicode) :: read_following_lines() with Not_found -> read_following_lines() | End_of_file -> [] in read_following_lines() ;; type from_uni_list = U_nil | U_single of (int * int) | U_list of (int * int) list type from_unicode = from_uni_list array;; (* A hashtable with fixed size (256). A pair (unicode, localcode) is * stored at the position unicode mod 256 in the array. *) let make_bijection unimap = (* unimap: a list of pairs (localcode, unicode) * returns a pair of arrays (m_to_unicode, m_from_unicode) with: * - m_to_unicode.(localcode) = Some unicode, * if the pair (localcode, unicode) exists * m_to_unicode.(x) = None otherwise * - m_from_unicode.(unicode lsr 8) = [ ...; (unicode,localcode); ... ] *) let m_to_unicode = Array.create 256 None in let m_from_unicode = Array.create 256 [] in List.iter (fun (localcode, unicode) -> assert(localcode < 256); (* Update m_to_unicode: *) if m_to_unicode.(localcode) <> None then failwith ("Local code point " ^ string_of_int localcode ^ " mapped twice"); m_to_unicode.(localcode) <- Some unicode; (* Update m_from_unicode: *) let unilow = unicode land 255 in if List.mem_assoc unicode (m_from_unicode.(unilow)) then failwith ("Unicode code point " ^ string_of_int unicode ^ " mapped twice"); m_from_unicode.(unilow) <- m_from_unicode.(unilow) @ [unicode,localcode]; ) unimap; m_to_unicode, m_from_unicode ;; let to_unimap_as_string to_unimap = let make_repr x = match x with None -> -1 | Some u -> u in Marshal.to_string (Array.map make_repr to_unimap) [ Marshal.No_sharing ] ;; let from_unimap_as_string from_unimap = let make_repr l = match l with [] -> U_nil | [u,l] -> U_single(u,l) | _ -> U_list l in let m = Array.map make_repr from_unimap in Marshal.to_string m [ Marshal.No_sharing ] ;; let print_bijection f name m_to_unicode m_from_unicode = (* Prints on file f this O'Caml code: * let _to_unicode = ... * let _from_unicode = ... *) fprintf f "let %s_to_unicode = lazy (Marshal.from_string \"%s\" 0 : int array);;\n" name (String.escaped (to_unimap_as_string m_to_unicode)); fprintf f "let %s_from_unicode = lazy (Marshal.from_string \"%s\" 0 : Netmappings.from_uni_list array);;\n " name (String.escaped (from_unimap_as_string m_from_unicode)); ;; let main() = let files = ref [] in let outch = ref (lazy stdout) in Arg.parse [ "-o", Arg.String (fun s -> outch := lazy (open_out s)), " Write result to this file"] (fun s -> files := !files @ [s]) "usage: unimap_to_ocaml file.unimap ..."; (* First read in all unimaps: *) let unimaps = List.map (fun filename -> let mapname = Str.replace_first (Str.regexp "\.unimap$") "" (Filename.basename filename) in let f = open_in filename in prerr_endline ("Reading " ^ filename); let unimap = read_unimap_format_a filename f in close_in f; mapname, unimap ) !files in (* Second compute all bijections: *) let bijections = List.map (fun (mapname, unimap) -> prerr_endline ("Processing " ^ mapname); let to_unicode, from_unicode = make_bijection unimap in mapname, to_unicode, from_unicode ) unimaps in let out = Lazy.force !outch in (* Third output all results: *) output_string out "(* WARNING! This is a generated file! *)\n"; List.iter (fun (mapname, to_unicode, from_unicode) -> print_bijection out mapname to_unicode from_unicode) bijections; List.iter (fun (mapname, _, _) -> fprintf out "Hashtbl.add Netmappings.to_unicode `Enc_%s %s_to_unicode;\n" mapname mapname; fprintf out "Hashtbl.add Netmappings.from_unicode `Enc_%s %s_from_unicode;\n" mapname mapname; ) (List.rev bijections); fprintf out "();;\n"; close_out out ;; main();; (* ====================================================================== * History: * * $Log$ * Revision 1.1 2000/11/17 09:57:29 lpadovan * Initial revision * * Revision 1.3 2000/08/29 00:48:52 gerd * Conversion tables are now stored in marshalled form. * New type for the conversion table Unicode to 8bit. * * Revision 1.2 2000/08/12 23:54:56 gerd * Initial revision. * * *)