--- /dev/null
+(* $Id$
+ * ----------------------------------------------------------------------
+ *
+ *)
+
+open Printf;;
+
+let comment_re = Str.regexp "#.*$";;
+let space_re = Str.regexp "[ \t\r\n]+";;
+
+let read_unimap_format_a fname f =
+ (* Reads a Unicode mapping in format A from a "local" code to Unicode.
+ * Returns a list of pairs (localcode, unicode).
+ *)
+
+ let read_unimap_line() =
+ let s = input_line f in (* may raise End_of_file *)
+ let s' = Str.global_replace comment_re "" s in
+ let words = Str.split space_re s' in
+ match words with
+ [] -> raise Not_found
+ | [ localcode; unicode ] ->
+ int_of_string localcode, int_of_string unicode
+ | _ ->
+ failwith ("File " ^ fname ^ ": Do not know what to do with:\n" ^ s')
+ in
+
+ let rec read_following_lines() =
+ try
+ let localcode, unicode = read_unimap_line() in
+ (* may raise End_of_file, Not_found *)
+ (localcode, unicode) :: read_following_lines()
+ with
+ Not_found -> read_following_lines()
+ | End_of_file -> []
+ in
+
+ read_following_lines()
+;;
+
+
+type from_uni_list =
+ U_nil
+ | U_single of (int * int)
+ | U_list of (int * int) list
+
+type from_unicode =
+ from_uni_list array;;
+ (* A hashtable with fixed size (256). A pair (unicode, localcode) is
+ * stored at the position unicode mod 256 in the array.
+ *)
+
+
+let make_bijection unimap =
+ (* unimap: a list of pairs (localcode, unicode)
+ * returns a pair of arrays (m_to_unicode, m_from_unicode) with:
+ * - m_to_unicode.(localcode) = Some unicode,
+ * if the pair (localcode, unicode) exists
+ * m_to_unicode.(x) = None otherwise
+ * - m_from_unicode.(unicode lsr 8) = [ ...; (unicode,localcode); ... ]
+ *)
+
+ let m_to_unicode = Array.create 256 None in
+ let m_from_unicode = Array.create 256 [] in
+
+ List.iter
+ (fun (localcode, unicode) ->
+ assert(localcode < 256);
+
+ (* Update m_to_unicode: *)
+ if m_to_unicode.(localcode) <> None then
+ failwith ("Local code point " ^ string_of_int localcode ^
+ " mapped twice");
+ m_to_unicode.(localcode) <- Some unicode;
+
+ (* Update m_from_unicode: *)
+ let unilow = unicode land 255 in
+ if List.mem_assoc unicode (m_from_unicode.(unilow)) then
+ failwith ("Unicode code point " ^ string_of_int unicode ^
+ " mapped twice");
+ m_from_unicode.(unilow) <-
+ m_from_unicode.(unilow) @ [unicode,localcode];
+ )
+ unimap;
+
+ m_to_unicode, m_from_unicode
+;;
+
+
+let to_unimap_as_string to_unimap =
+ let make_repr x =
+ match x with
+ None -> -1
+ | Some u -> u
+ in
+ Marshal.to_string (Array.map make_repr to_unimap) [ Marshal.No_sharing ]
+;;
+
+
+let from_unimap_as_string from_unimap =
+ let make_repr l =
+ match l with
+ [] -> U_nil
+ | [u,l] -> U_single(u,l)
+ | _ -> U_list l
+ in
+ let m = Array.map make_repr from_unimap in
+ Marshal.to_string m [ Marshal.No_sharing ]
+;;
+
+
+let print_bijection f name m_to_unicode m_from_unicode =
+ (* Prints on file f this O'Caml code:
+ * let <name>_to_unicode = ...
+ * let <name>_from_unicode = ...
+ *)
+ fprintf f "let %s_to_unicode = lazy (Marshal.from_string \"%s\" 0 : int array);;\n"
+ name
+ (String.escaped (to_unimap_as_string m_to_unicode));
+
+ fprintf f "let %s_from_unicode = lazy (Marshal.from_string \"%s\" 0 : Netmappings.from_uni_list array);;\n "
+ name
+ (String.escaped (from_unimap_as_string m_from_unicode));
+;;
+
+
+let main() =
+ let files = ref [] in
+ let outch = ref (lazy stdout) in
+ Arg.parse
+ [ "-o", Arg.String (fun s -> outch := lazy (open_out s)),
+ " <file> Write result to this file"]
+ (fun s -> files := !files @ [s])
+ "usage: unimap_to_ocaml file.unimap ...";
+
+ (* First read in all unimaps: *)
+ let unimaps =
+ List.map
+ (fun filename ->
+ let mapname = Str.replace_first (Str.regexp "\.unimap$") ""
+ (Filename.basename filename) in
+ let f = open_in filename in
+ prerr_endline ("Reading " ^ filename);
+ let unimap = read_unimap_format_a filename f in
+ close_in f;
+ mapname, unimap
+ )
+ !files
+ in
+
+ (* Second compute all bijections: *)
+ let bijections =
+ List.map
+ (fun (mapname, unimap) ->
+ prerr_endline ("Processing " ^ mapname);
+ let to_unicode, from_unicode = make_bijection unimap in
+ mapname, to_unicode, from_unicode
+ )
+ unimaps
+ in
+
+ let out = Lazy.force !outch in
+ (* Third output all results: *)
+ output_string out "(* WARNING! This is a generated file! *)\n";
+
+ List.iter
+ (fun (mapname, to_unicode, from_unicode) ->
+ print_bijection out mapname to_unicode from_unicode)
+ bijections;
+ List.iter
+ (fun (mapname, _, _) ->
+ fprintf out "Hashtbl.add Netmappings.to_unicode `Enc_%s %s_to_unicode;\n"
+ mapname mapname;
+ fprintf out "Hashtbl.add Netmappings.from_unicode `Enc_%s %s_from_unicode;\n"
+ mapname mapname;
+ )
+ (List.rev bijections);
+ fprintf out "();;\n";
+
+ close_out out
+;;
+
+
+main();;
+
+(* ======================================================================
+ * History:
+ *
+ * $Log$
+ * Revision 1.1 2000/11/17 09:57:29 lpadovan
+ * Initial revision
+ *
+ * Revision 1.3 2000/08/29 00:48:52 gerd
+ * Conversion tables are now stored in marshalled form.
+ * New type for the conversion table Unicode to 8bit.
+ *
+ * Revision 1.2 2000/08/12 23:54:56 gerd
+ * Initial revision.
+ *
+ *
+ *)