X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fnetstring%2Ftools%2Funimap_to_ocaml%2Funimap_to_ocaml.ml;fp=helm%2FDEVEL%2Fpxp%2Fnetstring%2Ftools%2Funimap_to_ocaml%2Funimap_to_ocaml.ml;h=14a89e9d986de436ee102905d0faab0e7279138c;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/netstring/tools/unimap_to_ocaml/unimap_to_ocaml.ml b/helm/DEVEL/pxp/netstring/tools/unimap_to_ocaml/unimap_to_ocaml.ml new file mode 100644 index 000000000..14a89e9d9 --- /dev/null +++ b/helm/DEVEL/pxp/netstring/tools/unimap_to_ocaml/unimap_to_ocaml.ml @@ -0,0 +1,201 @@ +(* $Id$ + * ---------------------------------------------------------------------- + * + *) + +open Printf;; + +let comment_re = Str.regexp "#.*$";; +let space_re = Str.regexp "[ \t\r\n]+";; + +let read_unimap_format_a fname f = + (* Reads a Unicode mapping in format A from a "local" code to Unicode. + * Returns a list of pairs (localcode, unicode). + *) + + let read_unimap_line() = + let s = input_line f in (* may raise End_of_file *) + let s' = Str.global_replace comment_re "" s in + let words = Str.split space_re s' in + match words with + [] -> raise Not_found + | [ localcode; unicode ] -> + int_of_string localcode, int_of_string unicode + | _ -> + failwith ("File " ^ fname ^ ": Do not know what to do with:\n" ^ s') + in + + let rec read_following_lines() = + try + let localcode, unicode = read_unimap_line() in + (* may raise End_of_file, Not_found *) + (localcode, unicode) :: read_following_lines() + with + Not_found -> read_following_lines() + | End_of_file -> [] + in + + read_following_lines() +;; + + +type from_uni_list = + U_nil + | U_single of (int * int) + | U_list of (int * int) list + +type from_unicode = + from_uni_list array;; + (* A hashtable with fixed size (256). A pair (unicode, localcode) is + * stored at the position unicode mod 256 in the array. + *) + + +let make_bijection unimap = + (* unimap: a list of pairs (localcode, unicode) + * returns a pair of arrays (m_to_unicode, m_from_unicode) with: + * - m_to_unicode.(localcode) = Some unicode, + * if the pair (localcode, unicode) exists + * m_to_unicode.(x) = None otherwise + * - m_from_unicode.(unicode lsr 8) = [ ...; (unicode,localcode); ... ] + *) + + let m_to_unicode = Array.create 256 None in + let m_from_unicode = Array.create 256 [] in + + List.iter + (fun (localcode, unicode) -> + assert(localcode < 256); + + (* Update m_to_unicode: *) + if m_to_unicode.(localcode) <> None then + failwith ("Local code point " ^ string_of_int localcode ^ + " mapped twice"); + m_to_unicode.(localcode) <- Some unicode; + + (* Update m_from_unicode: *) + let unilow = unicode land 255 in + if List.mem_assoc unicode (m_from_unicode.(unilow)) then + failwith ("Unicode code point " ^ string_of_int unicode ^ + " mapped twice"); + m_from_unicode.(unilow) <- + m_from_unicode.(unilow) @ [unicode,localcode]; + ) + unimap; + + m_to_unicode, m_from_unicode +;; + + +let to_unimap_as_string to_unimap = + let make_repr x = + match x with + None -> -1 + | Some u -> u + in + Marshal.to_string (Array.map make_repr to_unimap) [ Marshal.No_sharing ] +;; + + +let from_unimap_as_string from_unimap = + let make_repr l = + match l with + [] -> U_nil + | [u,l] -> U_single(u,l) + | _ -> U_list l + in + let m = Array.map make_repr from_unimap in + Marshal.to_string m [ Marshal.No_sharing ] +;; + + +let print_bijection f name m_to_unicode m_from_unicode = + (* Prints on file f this O'Caml code: + * let _to_unicode = ... + * let _from_unicode = ... + *) + fprintf f "let %s_to_unicode = lazy (Marshal.from_string \"%s\" 0 : int array);;\n" + name + (String.escaped (to_unimap_as_string m_to_unicode)); + + fprintf f "let %s_from_unicode = lazy (Marshal.from_string \"%s\" 0 : Netmappings.from_uni_list array);;\n " + name + (String.escaped (from_unimap_as_string m_from_unicode)); +;; + + +let main() = + let files = ref [] in + let outch = ref (lazy stdout) in + Arg.parse + [ "-o", Arg.String (fun s -> outch := lazy (open_out s)), + " Write result to this file"] + (fun s -> files := !files @ [s]) + "usage: unimap_to_ocaml file.unimap ..."; + + (* First read in all unimaps: *) + let unimaps = + List.map + (fun filename -> + let mapname = Str.replace_first (Str.regexp "\.unimap$") "" + (Filename.basename filename) in + let f = open_in filename in + prerr_endline ("Reading " ^ filename); + let unimap = read_unimap_format_a filename f in + close_in f; + mapname, unimap + ) + !files + in + + (* Second compute all bijections: *) + let bijections = + List.map + (fun (mapname, unimap) -> + prerr_endline ("Processing " ^ mapname); + let to_unicode, from_unicode = make_bijection unimap in + mapname, to_unicode, from_unicode + ) + unimaps + in + + let out = Lazy.force !outch in + (* Third output all results: *) + output_string out "(* WARNING! This is a generated file! *)\n"; + + List.iter + (fun (mapname, to_unicode, from_unicode) -> + print_bijection out mapname to_unicode from_unicode) + bijections; + List.iter + (fun (mapname, _, _) -> + fprintf out "Hashtbl.add Netmappings.to_unicode `Enc_%s %s_to_unicode;\n" + mapname mapname; + fprintf out "Hashtbl.add Netmappings.from_unicode `Enc_%s %s_from_unicode;\n" + mapname mapname; + ) + (List.rev bijections); + fprintf out "();;\n"; + + close_out out +;; + + +main();; + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:29 lpadovan + * Initial revision + * + * Revision 1.3 2000/08/29 00:48:52 gerd + * Conversion tables are now stored in marshalled form. + * New type for the conversion table Unicode to 8bit. + * + * Revision 1.2 2000/08/12 23:54:56 gerd + * Initial revision. + * + * + *)