From a3fb06e72407e3590fa60a74ac5fec01e6bfe1f9 Mon Sep 17 00:00:00 2001 From: Stefano Zacchiroli Date: Tue, 5 Jul 2005 08:09:24 +0000 Subject: [PATCH] new getter implementation: no more DBM maps --- helm/ocaml/getter/.depend | 28 +- helm/ocaml/getter/.ocamlinit | 3 + helm/ocaml/getter/Makefile | 4 +- helm/ocaml/getter/http_getter.ml | 718 ++++++---------------- helm/ocaml/getter/http_getter.mli | 33 +- helm/ocaml/getter/http_getter_cache.ml | 262 -------- helm/ocaml/getter/http_getter_cache.mli | 50 -- helm/ocaml/getter/http_getter_common.ml | 14 +- helm/ocaml/getter/http_getter_common.mli | 7 +- helm/ocaml/getter/http_getter_const.ml | 8 +- helm/ocaml/getter/http_getter_env.ml | 157 +---- helm/ocaml/getter/http_getter_env.mli | 83 +-- helm/ocaml/getter/http_getter_md5.ml | 42 -- helm/ocaml/getter/http_getter_md5.mli | 2 - helm/ocaml/getter/http_getter_misc.ml | 11 + helm/ocaml/getter/http_getter_misc.mli | 2 + helm/ocaml/getter/http_getter_storage.ml | 221 +++++++ helm/ocaml/getter/http_getter_storage.mli | 62 ++ helm/ocaml/getter/http_getter_types.ml | 1 + helm/ocaml/getter/http_getter_wget.ml | 24 +- helm/ocaml/getter/http_getter_wget.mli | 6 +- helm/ocaml/getter/mkindexes.pl | 40 ++ helm/ocaml/getter/sample.conf.xml | 37 ++ helm/ocaml/getter/test.ml | 7 +- helm/ocaml/getter/tree.ml | 76 --- helm/ocaml/getter/tree.mli | 12 - 26 files changed, 674 insertions(+), 1236 deletions(-) create mode 100644 helm/ocaml/getter/.ocamlinit delete mode 100644 helm/ocaml/getter/http_getter_cache.ml delete mode 100644 helm/ocaml/getter/http_getter_cache.mli delete mode 100644 helm/ocaml/getter/http_getter_md5.ml delete mode 100644 helm/ocaml/getter/http_getter_md5.mli create mode 100644 helm/ocaml/getter/http_getter_storage.ml create mode 100644 helm/ocaml/getter/http_getter_storage.mli create mode 100755 helm/ocaml/getter/mkindexes.pl create mode 100644 helm/ocaml/getter/sample.conf.xml delete mode 100644 helm/ocaml/getter/tree.ml delete mode 100644 helm/ocaml/getter/tree.mli diff --git a/helm/ocaml/getter/.depend b/helm/ocaml/getter/.depend index 141c2a88c..bddc04dd4 100644 --- a/helm/ocaml/getter/.depend +++ b/helm/ocaml/getter/.depend @@ -1,9 +1,5 @@ -http_getter_env.cmi: http_getter_types.cmo http_getter_common.cmi: http_getter_types.cmo -http_getter_cache.cmi: http_getter_types.cmo http_getter.cmi: http_getter_types.cmo -tree.cmo: tree.cmi -tree.cmx: tree.cmi http_getter_wget.cmo: http_getter_types.cmo http_getter_wget.cmi http_getter_wget.cmx: http_getter_types.cmx http_getter_wget.cmi http_getter_logger.cmo: http_getter_logger.cmi @@ -16,8 +12,10 @@ http_getter_env.cmo: http_getter_types.cmo http_getter_misc.cmi \ http_getter_logger.cmi http_getter_const.cmi http_getter_env.cmi http_getter_env.cmx: http_getter_types.cmx http_getter_misc.cmx \ http_getter_logger.cmx http_getter_const.cmx http_getter_env.cmi -http_getter_md5.cmo: http_getter_env.cmi http_getter_md5.cmi -http_getter_md5.cmx: http_getter_env.cmx http_getter_md5.cmi +http_getter_storage.cmo: http_getter_wget.cmi http_getter_types.cmo \ + http_getter_misc.cmi http_getter_env.cmi http_getter_storage.cmi +http_getter_storage.cmx: http_getter_wget.cmx http_getter_types.cmx \ + http_getter_misc.cmx http_getter_env.cmx http_getter_storage.cmi http_getter_common.cmo: http_getter_types.cmo http_getter_misc.cmi \ http_getter_logger.cmi http_getter_env.cmi http_getter_common.cmi http_getter_common.cmx: http_getter_types.cmx http_getter_misc.cmx \ @@ -26,17 +24,11 @@ http_getter_map.cmo: http_getter_types.cmo http_getter_misc.cmi \ 
http_getter_map.cmi http_getter_map.cmx: http_getter_types.cmx http_getter_misc.cmx \ http_getter_map.cmi -http_getter_cache.cmo: http_getter_types.cmo http_getter_misc.cmi \ - http_getter_logger.cmi http_getter_env.cmi http_getter_common.cmi \ - http_getter_cache.cmi -http_getter_cache.cmx: http_getter_types.cmx http_getter_misc.cmx \ - http_getter_logger.cmx http_getter_env.cmx http_getter_common.cmx \ - http_getter_cache.cmi -http_getter.cmo: tree.cmi http_getter_wget.cmi http_getter_types.cmo \ - http_getter_misc.cmi http_getter_md5.cmi http_getter_map.cmi \ +http_getter.cmo: http_getter_wget.cmi http_getter_types.cmo \ + http_getter_storage.cmi http_getter_misc.cmi http_getter_map.cmi \ http_getter_logger.cmi http_getter_env.cmi http_getter_const.cmi \ - http_getter_common.cmi http_getter_cache.cmi http_getter.cmi -http_getter.cmx: tree.cmx http_getter_wget.cmx http_getter_types.cmx \ - http_getter_misc.cmx http_getter_md5.cmx http_getter_map.cmx \ + http_getter_common.cmi http_getter.cmi +http_getter.cmx: http_getter_wget.cmx http_getter_types.cmx \ + http_getter_storage.cmx http_getter_misc.cmx http_getter_map.cmx \ http_getter_logger.cmx http_getter_env.cmx http_getter_const.cmx \ - http_getter_common.cmx http_getter_cache.cmx http_getter.cmi + http_getter_common.cmx http_getter.cmi diff --git a/helm/ocaml/getter/.ocamlinit b/helm/ocaml/getter/.ocamlinit new file mode 100644 index 000000000..6512190cd --- /dev/null +++ b/helm/ocaml/getter/.ocamlinit @@ -0,0 +1,3 @@ +#use "topfind";; +#require "helm-getter";; +Helm_registry.load_from "sample.conf.xml";; diff --git a/helm/ocaml/getter/Makefile b/helm/ocaml/getter/Makefile index d90461131..8732c11ff 100644 --- a/helm/ocaml/getter/Makefile +++ b/helm/ocaml/getter/Makefile @@ -6,16 +6,14 @@ REQUIRES = \ helm-xml helm-thread helm-logger helm-urimanager helm-registry INTERFACE_FILES = \ - tree.mli \ http_getter_wget.mli \ http_getter_logger.mli \ http_getter_misc.mli \ http_getter_const.mli \ http_getter_env.mli \ - http_getter_md5.mli \ + http_getter_storage.mli \ http_getter_common.mli \ http_getter_map.mli \ - http_getter_cache.mli \ http_getter.mli \ $(NULL) diff --git a/helm/ocaml/getter/http_getter.ml b/helm/ocaml/getter/http_getter.ml index 4094a7376..9254c13f6 100644 --- a/helm/ocaml/getter/http_getter.ml +++ b/helm/ocaml/getter/http_getter.ml @@ -1,29 +1,26 @@ -(* - * Copyright (C) 2003-2004: - * Stefano Zacchiroli - * for the HELM Team http://helm.cs.unibo.it/ +(* Copyright (C) 2005, HELM Team. + * + * This file is part of HELM, an Hypertextual, Electronic + * Library of Mathematics, developed at the Computer Science + * Department, University of Bologna, Italy. + * + * HELM is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * HELM is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * - * This file is part of HELM, an Hypertextual, Electronic - * Library of Mathematics, developed at the Computer Science - * Department, University of Bologna, Italy. 
- * - * HELM is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * HELM is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with HELM; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, - * MA 02111-1307, USA. - * - * For details, see the HELM World-Wide-Web page, - * http://helm.cs.unibo.it/ + * You should have received a copy of the GNU General Public License + * along with HELM; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + * + * For details, see the HELM World-Wide-Web page, + * http://helm.cs.unibo.it/ *) open Printf @@ -51,201 +48,47 @@ let index_sep_RE = Pcre.regexp "\r\n|\r|\n" let trailing_types_RE = Pcre.regexp "\\.types$" let heading_cic_RE = Pcre.regexp "^cic:" let heading_theory_RE = Pcre.regexp "^theory:" -let heading_nuprl_RE = Pcre.regexp "^nuprl:" -let heading_rdf_cic_RE = Pcre.regexp "^helm:rdf.*//cic:" -let heading_rdf_theory_RE = Pcre.regexp "^helm:rdf.*//theory:" +let heading_nuprl_RE = Pcre.regexp "^nuprl:" +let types_RE = Pcre.regexp "\\.types$" +let types_ann_RE = Pcre.regexp "\\.types\\.ann$" +let body_RE = Pcre.regexp "\\.body$" +let body_ann_RE = Pcre.regexp "\\.body\\.ann$" +let proof_tree_RE = Pcre.regexp "\\.proof_tree$" +let proof_tree_ann_RE = Pcre.regexp "\\.proof_tree\\.ann$" +let theory_RE = Pcre.regexp "\\.theory$" +let basepart_RE = Pcre.regexp + "^([^.]*\\.[^.]*)((\\.body)|(\\.proof_tree)|(\\.types))?(\\.ann)?$" +let slash_RE = Pcre.regexp "/" +let pipe_RE = Pcre.regexp "\\|" +let til_slash_RE = Pcre.regexp "^.*/" +let no_slashes_RE = Pcre.regexp "^[^/]*$" +let fix_regexp_RE = Pcre.regexp ("^" ^ (Pcre.quote "(cic|theory)")) +let showable_file_RE = Pcre.regexp "(\\.con|\\.ind|\\.var)$" + +let xml_suffix = ".xml" +let theory_suffix = ".theory" ^ xml_suffix (* global maps, shared by all threads *) -let cic_map = - lazy (new Http_getter_map.map (Lazy.force Http_getter_env.cic_dbm)) -let nuprl_map = - lazy (new Http_getter_map.map (Lazy.force Http_getter_env.nuprl_dbm)) -let rdf_map = - lazy (new Http_getter_map.map (Lazy.force Http_getter_env.rdf_dbm)) -let xsl_map = - lazy (new Http_getter_map.map (Lazy.force Http_getter_env.xsl_dbm)) - -let uri_tree = ref None -let deref_if_some r = - match !r with - | None -> assert false - | Some x -> x -let is_prefetch_on () = - match !uri_tree with None -> false | Some _ -> true - -let dump_tree () = - let path = Lazy.force Http_getter_env.dump_file in - Tree.save_to_disk path (deref_if_some uri_tree); - Http_getter_md5.create_hash [ - (Lazy.force Http_getter_env.cic_dbm_real); - path ] - -let load_tree () = - if not (Http_getter_md5.check_hash ()) then - assert false - else - uri_tree := Some (Tree.load_from_disk - (Lazy.force Http_getter_env.dump_file)) - -let sync_with_map () = - if not (Http_getter_md5.check_hash ()) then begin - let tree = ref (Some Tree.empty_tree) in - Http_getter_logger.log "Updating cic map dump..."; - let t = Unix.time () in - (Lazy.force cic_map)#iter - (fun k _ -> - tree := Some (Tree.add_uri k (deref_if_some tree))); - 
uri_tree := !tree; - Http_getter_logger.log - (sprintf "done in %.0f sec" (Unix.time () -. t)); - dump_tree () - end else begin - Http_getter_logger.log "Cic map dump is up to date!"; - load_tree () (* XXX TASSI: race condition here *) - end - -let maps = [ cic_map; nuprl_map; rdf_map; xsl_map ] -let close_maps () = List.iter (fun m -> (Lazy.force m) # close) maps -let clear_maps () = List.iter (fun m -> (Lazy.force m) # clear) maps -let sync_maps () = - List.iter (fun m -> (Lazy.force m) # sync) maps; - sync_with_map () - -let map_of_uri = function - | uri when is_cic_uri uri -> Lazy.force cic_map - | uri when is_nuprl_uri uri -> Lazy.force nuprl_map - | uri when is_rdf_uri uri -> Lazy.force rdf_map - | uri when is_xsl_uri uri -> Lazy.force xsl_map - | uri -> raise (Unresolvable_URI uri) - -let update_from_server logger server_url = (* use global maps *) - Http_getter_logger.log ("Updating information from " ^ server_url); - let xml_url_of_uri = function - (* TODO missing sanity checks on server_url, e.g. it can contains $1 *) - | uri when (Pcre.pmatch ~rex:heading_cic_RE uri) -> - Pcre.replace ~rex:heading_cic_RE ~templ:server_url uri - | uri when (Pcre.pmatch ~rex:heading_theory_RE uri) -> - Pcre.replace ~rex:heading_theory_RE ~templ:server_url uri - | uri when (Pcre.pmatch ~rex:heading_nuprl_RE uri) -> - Pcre.replace ~rex:heading_nuprl_RE ~templ:server_url uri - | uri -> raise (Invalid_URI uri) - in - let rdf_url_of_uri = function (* TODO as above *) - | uri when (Pcre.pmatch ~rex:heading_rdf_cic_RE uri) -> - Pcre.replace ~rex:heading_rdf_cic_RE ~templ:server_url uri - | uri when (Pcre.pmatch ~rex:heading_rdf_theory_RE uri) -> - Pcre.replace ~rex:heading_rdf_theory_RE ~templ:server_url uri - | uri -> raise (Invalid_URI uri) - in - logger (`T ("Processing server: " ^ server_url)); - logger `BR; - let (xml_index, rdf_index, xsl_index) = - (* TODO keeps index in memory, is better to keep them on temp files? 
*) - (http_get (server_url ^ "/" ^ (Lazy.force Http_getter_env.xml_index)), - http_get (server_url ^ "/" ^ (Lazy.force Http_getter_env.rdf_index)), - http_get (server_url ^ "/" ^ (Lazy.force Http_getter_env.xsl_index))) - in - if (xml_index = None && rdf_index = None && xsl_index = None) then - Http_getter_logger.log (sprintf "Warning: useless server %s" server_url); - (match xml_index with - | Some xml_index -> - logger (`T "- Updating XML db ..."); -(* logger `BR; *) - List.iter - (function - | l when is_blank_line l -> () (* skip blank and commented lines *) - | l -> - (try - (match Pcre.split ~rex:index_line_sep_RE l with - | [uri; "gz"] -> - assert (is_cic_uri uri || is_nuprl_uri uri) ; - (map_of_uri uri)#replace - uri ((xml_url_of_uri uri) ^ ".xml.gz") - | [uri] -> - assert (is_cic_uri uri || is_nuprl_uri uri) ; - (map_of_uri uri)#replace - uri ((xml_url_of_uri uri) ^ ".xml") - | _ -> - logger (`T ("Ignoring invalid line: '" ^ l)); - logger `BR) - with Invalid_URI uri -> - logger (`T ("Ignoring invalid XML URI: '" ^ l)); - logger `BR)) - (Pcre.split ~rex:index_sep_RE xml_index); (* xml_index lines *) - logger (`T "All done"); - logger `BR - | None -> ()); - (match rdf_index with - | Some rdf_index -> - logger (`T "- Updating RDF db ..."); -(* logger `BR; *) - List.iter - (fun l -> - try - (match Pcre.split ~rex:index_line_sep_RE l with - | [uri; "gz"] -> - (Lazy.force rdf_map) # replace uri - ((rdf_url_of_uri uri) ^ ".xml.gz") - | [uri] -> - (Lazy.force rdf_map) # replace uri - ((rdf_url_of_uri uri) ^ ".xml") - | _ -> - logger (`T ("Ignoring invalid line: '" ^ l)); - logger `BR) - with Invalid_URI uri -> - logger (`T ("Ignoring invalid RDF URI: '" ^ l)); - logger `BR) - (Pcre.split ~rex:index_sep_RE rdf_index); (* rdf_index lines *) - logger (`T "All done"); - logger `BR - | None -> ()); - (match xsl_index with - | Some xsl_index -> - logger (`T "- Updating XSLT db ..."); -(* logger `BR; *) - List.iter - (fun l -> (Lazy.force xsl_map) # replace l (server_url ^ "/" ^ l)) - (Pcre.split ~rex:index_sep_RE xsl_index); - logger (`T "All done"); - logger `BR - | None -> ()); - Http_getter_logger.log "done with this server" - -let update_from_all_servers logger () = (* use global maps *) - clear_maps (); - List.iter - (update_from_server logger) - (* reverse order: 1st server is the most important one *) - (List.map snd (List.rev (Http_getter_env.servers ()))); - sync_maps () - -let update_from_one_server ?(logger = fun _ -> ()) server_url = - update_from_server logger server_url - -let temp_file_of_uri uri = - let flat_string s s' c = - let cs = String.copy s in - for i = 0 to (String.length s) - 1 do - if String.contains s' s.[i] then cs.[i] <- c - done; - cs - in - let user = try Unix.getlogin () with _ -> "" in - Filename.open_temp_file (user ^ flat_string uri ".-=:;!?/&" '_') "" +let ends_with_slash s = + try + s.[String.length s - 1] = '/' + with Invalid_argument _ -> false (* should we use a remote getter or not *) let remote () = try Helm_registry.get "getter.mode" = "remote" with Helm_registry.Key_not_found _ -> false + let getter_url () = Helm_registry.get "getter.url" (* Remote interface: getter methods implemented using a remote getter *) (* *) +let getxml_remote uri = not_implemented "getxml_remote" let getxslt_remote ~patch_dtd uri = not_implemented "getxslt_remote" -let getdtd_remote ~patch_dtd uri = not_implemented "getdtd_remote" +let getdtd_remote uri = not_implemented "getdtd_remote" let clean_cache_remote () = not_implemented "clean_cache_remote" let list_servers_remote 
() = not_implemented "list_servers_remote" let add_server_remote ~logger ~position name = @@ -253,8 +96,8 @@ let add_server_remote ~logger ~position name = let remove_server_remote ~logger position = not_implemented "remove_server_remote" let getalluris_remote () = not_implemented "getalluris_remote" -let getallrdfuris_remote () = not_implemented "getallrdfuris_remote" let ls_remote lsuri = not_implemented "ls_remote" +let exists_remote uri = not_implemented "exists_remote" (* *) let resolve_remote uri = @@ -285,99 +128,51 @@ let resolve_remote uri = | Exception e -> raise e | Resolved url -> url -let register_remote ~uri ~url = - Http_getter_wget.send - (sprintf "%sregister?uri=%s&url=%s" (getter_url ()) uri url) - -let unregister_remote uri = - Http_getter_wget.send (sprintf "%sunregister?uri=%s" (getter_url ()) uri) - -let update_remote logger () = - let answer = Http_getter_wget.get (getter_url () ^ "update") in - logger (`T answer); - logger `BR - -let getxml_remote ~format ~patch_dtd uri = - let uri = - sprintf "%sgetxml?uri=%s&format=%s&patch_dtd=%s" - (getter_url ()) uri - (match format with `Normal -> "normal" | `Gzipped -> "gz") - (match patch_dtd with true -> "yes" | false -> "no") - in - Http_getter_wget.get_and_save_to_tmp uri - (* API *) let help () = Http_getter_const.usage_string (Http_getter_env.env_to_string ()) -let resolve uri = +let exists uri = if remote () then - resolve_remote uri + exists_remote uri else - (map_of_uri uri)#resolve uri + Http_getter_storage.exists (uri ^ xml_suffix) -let register ~uri ~url = - if remote () then - register_remote ~uri ~url - else - begin - (map_of_uri uri)#add uri url; - if is_prefetch_on () then - uri_tree := Some (Tree.add_uri uri (deref_if_some uri_tree)) - end - -let unregister uri = +let resolve uri = if remote () then - unregister_remote uri + resolve_remote uri else try - begin - (map_of_uri uri)#remove uri; - if is_prefetch_on () then - uri_tree := Some (Tree.remove_uri uri (deref_if_some uri_tree)) - end - with Key_not_found _ -> () - -let update ?(logger = fun _ -> ()) () = - if remote () then - update_remote logger () - else - update_from_all_servers logger () + Http_getter_storage.resolve (uri ^ xml_suffix) + with Http_getter_storage.Resource_not_found _ -> raise (Key_not_found uri) -let getxml ?(format = `Gzipped) ?(patch_dtd = false) uri = +let getxml uri = if remote () then - getxml_remote ~format ~patch_dtd uri + getxml_remote uri else begin - Http_getter_logger.log ~level:2 ("getxml: " ^ uri); - let url = resolve uri in - Http_getter_logger.log ~level:2 ("resolved_uri: " ^ url) ; - let (fname, outchan) = temp_file_of_uri uri in - Http_getter_cache.respond_xml ~via_http:false ~enc:format ~patch:patch_dtd - ~uri ~url outchan; - close_out outchan; - fname + try + Http_getter_storage.filename (uri ^ xml_suffix) + with Http_getter_storage.Resource_not_found _ -> raise (Key_not_found uri) end -let getxslt ?(patch_dtd = true) uri = +let getxslt ?(patch_dtd = true) uri = assert false +(* let getxslt ?(patch_dtd = true) uri = if remote () then getxslt_remote ~patch_dtd uri else begin - let url = resolve uri in let (fname, outchan) = temp_file_of_uri uri in Http_getter_cache.respond_xsl ~via_http:false ~url ~patch:patch_dtd outchan; close_out outchan; fname - end + end *) -let getdtd ?(patch_dtd = true) uri = +let getdtd uri = if remote () then - getdtd_remote ~patch_dtd uri + getdtd_remote uri else begin - let url = Lazy.force Http_getter_env.dtd_dir ^ "/" ^ uri in - let (fname, outchan) = temp_file_of_uri uri in - 
Http_getter_cache.respond_dtd ~via_http:false ~url ~patch:patch_dtd outchan; - close_out outchan; + let fname = Lazy.force Http_getter_env.dtd_dir ^ "/" ^ uri in + if not (Sys.file_exists fname) then raise (Dtd_not_found uri); fname end @@ -385,71 +180,7 @@ let clean_cache () = if remote () then clean_cache_remote () else - Http_getter_cache.clean () - -let list_servers () = - if remote () then - list_servers_remote () - else - Http_getter_env.servers () - -let add_server ?(logger = fun _ -> ()) ?(position = 0) name = - if remote () then - add_server_remote ~logger ~position name - else begin - if position = 0 then begin - Http_getter_env.add_server ~position:0 name; - update_from_one_server ~logger name (* quick update (new server only) *) - end else if position > 0 then begin - Http_getter_env.add_server ~position name; - update ~logger () - end else (* already checked by parse_position *) - assert false - end - -let has_server position = List.mem_assoc position (Http_getter_env.servers ()) - -let remove_server ?(logger = fun _ -> ()) position = - if remote () then - remove_server_remote ~logger () - else begin - let server_name = - try - List.assoc position (Http_getter_env.servers ()) - with Not_found -> - raise (Invalid_argument (sprintf "no server with position %d" position)) - in - Http_getter_env.remove_server position; - update ~logger () - end - -let return_uris map filter = - let uris = ref [] in - map#iter (fun uri _ -> if filter uri then uris := uri :: !uris); - List.rev !uris - -let getalluris () = - if remote () then - getalluris_remote () - else - let filter uri = - (Pcre.pmatch ~rex:heading_cic_RE uri) -(* && not (Pcre.pmatch ~rex:trailing_types_RE uri) *) - in - return_uris (Lazy.force cic_map) filter - -let getallrdfuris classs = - if remote () then - getallrdfuris_remote () - else - let filter = - let base = "^helm:rdf:www\\.cs\\.unibo\\.it/helm/rdf/" in - match classs with - | `Forward -> (fun uri -> Pcre.pmatch ~pat:(base ^ "forward") uri) - | `Backward -> (fun uri -> Pcre.pmatch ~pat:(base ^ "backward") uri) - in - return_uris (Lazy.force rdf_map) filter - + Http_getter_storage.clean_cache () let (++) (oldann, oldtypes, oldbody, oldtree) (newann, newtypes, newbody, newtree) = @@ -458,213 +189,142 @@ let (++) (oldann, oldtypes, oldbody, oldtree) (if newbody > oldbody then newbody else oldbody), (if newtree > oldtree then newtree else oldtree)) -let (types_RE, types_ann_RE, body_RE, body_ann_RE, - proof_tree_RE, proof_tree_ann_RE, trailing_slash_RE, theory_RE) = - (Pcre.regexp "\\.types$", Pcre.regexp "\\.types\\.ann$", - Pcre.regexp "\\.body$", Pcre.regexp "\\.body\\.ann$", - Pcre.regexp "\\.proof_tree$", Pcre.regexp "\\.proof_tree\\.ann$", - Pcre.regexp "/$", Pcre.regexp "\\.theory$") - -let basepart_RE = - Pcre.regexp - "^([^.]*\\.[^.]*)((\\.body)|(\\.proof_tree)|(\\.types))?(\\.ann)?$" - -let (slash_RE, til_slash_RE, no_slashes_RE) = - (Pcre.regexp "/", Pcre.regexp "^.*/", Pcre.regexp "^[^/]*$") -let fix_regexp_RE = Pcre.regexp ("^" ^ (Pcre.quote "(cic|theory)")) - -let ls regexp = - if remote () then - ls_remote regexp - else begin - let looking_for_dir = Pcre.pmatch ~rex:trailing_slash_RE regexp in - let pat = Pcre.replace ~rex:trailing_slash_RE ("^" ^ regexp) in - let (dir_RE, dir_local_RE, obj_RE, first_comp_RE) = - Pcre.regexp (pat ^ "/"), Pcre.regexp "[^/]+/[^/]*", - Pcre.regexp (pat ^ "(\\.|$)"), Pcre.regexp "/.*" - in - let exists_theory regexp = - let theory = - Pcre.replace ~rex:fix_regexp_RE ~templ:"theory" regexp ^ "index.theory" - in +let store_obj tbl 
o = + if Pcre.pmatch ~rex:showable_file_RE o then begin + let basepart = Pcre.replace ~rex:basepart_RE ~templ:"$1" o in + let no_flags = false, No, No, No in + let oldflags = try - ignore (resolve theory); - true - with Key_not_found _ -> false + Hashtbl.find tbl basepart + with Not_found -> (* no ann, no types, no body, no proof tree *) + no_flags in - let toplevel_theory = - match List.rev (Pcre.split ~rex:slash_RE pat) with - | dir :: _ -> Some (dir ^ ".theory") - | _ -> None + let newflags = + match o with + | s when Pcre.pmatch ~rex:types_RE s -> (false, Yes, No, No) + | s when Pcre.pmatch ~rex:types_ann_RE s -> (true, Ann, No, No) + | s when Pcre.pmatch ~rex:body_RE s -> (false, No, Yes, No) + | s when Pcre.pmatch ~rex:body_ann_RE s -> (true, No, Ann, No) + | s when Pcre.pmatch ~rex:proof_tree_RE s -> (false, No, No, Yes) + | s when Pcre.pmatch ~rex:proof_tree_ann_RE s -> (true, No, No, Ann) + | s -> no_flags in + Hashtbl.replace tbl basepart (oldflags ++ newflags) + end + +let store_dir set_ref d = + set_ref := StringSet.add (List.hd (Pcre.split ~rex:slash_RE d)) !set_ref + +let collect_ls_items dirs_set objs_tbl = + let items = ref [] in + StringSet.iter (fun dir -> items := Ls_section dir :: !items) dirs_set; + Http_getter_misc.hashtbl_sorted_iter + (fun uri (annflag, typesflag, bodyflag, treeflag) -> + items := + Ls_object { + uri = uri; ann = annflag; + types = typesflag; body = bodyflag; proof_tree = treeflag + } :: !items) + objs_tbl; + List.rev !items + + (** non regexp-aware version of ls *) +let dumb_ls uri_prefix = + if is_cic_obj_uri uri_prefix then begin let dirs = ref StringSet.empty in let objs = Hashtbl.create 17 in - let store_dir d = - dirs := StringSet.add (List.hd (Pcre.split ~rex:slash_RE d)) !dirs - in - let store_obj o = - let basepart = Pcre.replace ~rex:basepart_RE ~templ:"$1" o in - let no_flags = false, No, No, No in - let oldflags = - try - Hashtbl.find objs basepart - with Not_found -> (* no ann, no types, no body, no proof tree *) - no_flags - in - let newflags = - match o with - | s when Pcre.pmatch ~rex:types_RE s -> (false, Yes, No, No) - | s when Pcre.pmatch ~rex:types_ann_RE s -> (true, Ann, No, No) - | s when Pcre.pmatch ~rex:body_RE s -> (false, No, Yes, No) - | s when Pcre.pmatch ~rex:body_ann_RE s -> (true, No, Ann, No) - | s when Pcre.pmatch ~rex:proof_tree_RE s -> (false, No, No, Yes) - | s when Pcre.pmatch ~rex:proof_tree_ann_RE s -> (true, No, No, Ann) - | s -> no_flags - in - Hashtbl.replace objs basepart (oldflags ++ newflags) - in - (* Variables used in backward compatibility code to map - theory:/path/t.theory into theory:/path/t/index.theory - when cic:/path/t/ exists *) - let the_candidate_for_remapping = - (* CSC: Here I am making a strong assumption: the pattern - can be only of the form [^:]*:/path where path is - NOT a regular expression *) - "theory:" ^ Pcre.replace ~rex:(Pcre.regexp "[^:]*:") pat + List.iter + (fun fname -> + if ends_with_slash fname then + store_dir dirs fname + else + try + store_obj objs (strip_suffix ~suffix:xml_suffix fname) + with Invalid_argument _ -> ()) + (Http_getter_storage.ls uri_prefix); + collect_ls_items !dirs objs + end else if is_theory_uri uri_prefix then begin + let items = ref [] in + List.iter + (fun fname -> + if ends_with_slash fname then + items := Ls_section (strip_trailing_slash fname) :: !items + else + try + let obj = + { uri = strip_suffix theory_suffix fname; + ann = false; types = No; body = No; proof_tree = No } + in + items := Ls_object obj :: !items + with Invalid_argument _ -> 
()) + (Http_getter_storage.ls uri_prefix); + !items + end else + raise (Invalid_URI uri_prefix) + + (* handle simple regular expressions of the form "...(..|..|..)..." on cic + * uris, not meant to be a real implementation of regexp. The only we use is + * "(cic|theory):/..." *) +let explode_ls_regexp regexp = + try + let len = String.length regexp in + let lparen_idx = String.index regexp '(' in + let rparen_idx = String.index_from regexp lparen_idx ')' in + let choices_str = (* substring between parens, parens excluded *) + String.sub regexp (lparen_idx + 1) (rparen_idx - lparen_idx - 1) in - let index_not_generated_yet = ref true in - let valid_candidates = ref [] in - let candidates_found = ref [] in - - (*(Lazy.force cic_map) # iter*) + let choices = Pcre.split ~rex:pipe_RE choices_str in + let prefix = String.sub regexp 0 lparen_idx in + let suffix = String.sub regexp (rparen_idx + 1) (len - (rparen_idx + 1)) in + List.map (fun choice -> prefix ^ choice ^ suffix) choices + with Not_found -> [regexp] + +let merge_results results = + let rec aux objects_acc dirs_acc = function + | [] -> dirs_acc @ objects_acc + | Ls_object _ as obj :: tl -> aux (obj :: objects_acc) dirs_acc tl + | Ls_section _ as dir :: tl -> + if List.mem dir dirs_acc then (* filters out dir duplicates *) + aux objects_acc dirs_acc tl + else + aux objects_acc (dir :: dirs_acc) tl + in + aux [] [] (List.concat results) - (* depending on prefetch *) - let if_prefetch if_is if_not = - if is_prefetch_on() then if_is else if_not - in - - let iter_on_right_thing = if_prefetch - (fun f -> List.iter (fun k -> f k "") - (Tree.ls_path regexp (deref_if_some uri_tree))) - (fun f -> (Lazy.force cic_map) # iter f) - in - let calculate_localpart = if_prefetch - (fun uri -> uri) - (fun uri -> Pcre.replace ~rex:dir_RE uri) - in - let check_if_x_RE = if_prefetch - (fun x_RE uri -> true) - (fun x_RE uri -> Pcre.pmatch ~rex:x_RE uri) - in - let check_if_dir_RE = check_if_x_RE dir_RE in - let check_if_obj_RE = check_if_x_RE obj_RE in - - iter_on_right_thing - (fun key _ -> - (* we work in two ways: - * 1 iteration on the whole map - * 2 tree visit - * - * Since in the first case 'key' is a complete uri, while - * in the second case it is only the subtree rooted in the - * query regex, we must relay only on the localpath. - * - * example: - * query::= cic:/aaa/bbb/ - * - * answer1 ::= the whole map - * - * aswer2 ::= [ "ccc/"; "c1.something"] where - * cic:/aaa/bbb/ccc/ and cic:/aaa/bbb/c1.something - * are the (partials) uri that matched query - * - * after the localpath extracion we have more uris in the first case, - * but at least the are all rooted in the same node. 
- * - * the Tree.get_frontier may be changed to return the same stuff as - * the map iteration+localpath extraction does, but I hope it is not - * necessary - *) - match key with - | uri when looking_for_dir && check_if_dir_RE uri -> - (* directory hit *) - let localpart = calculate_localpart uri in - if Pcre.pmatch ~rex:no_slashes_RE localpart then - begin - (* Backward compatibility code to map - theory:/path/t.theory into theory:/path/t/index.theory - when cic:/path/t/ exists *) - if Pcre.pmatch ~rex:theory_RE localpart then - candidates_found := localpart :: !candidates_found - else - store_obj localpart - end - else - begin - store_dir localpart ; - if Pcre.pmatch localpart ~rex:dir_local_RE then - begin - let valid = - Pcre.replace ~rex:first_comp_RE localpart ^ ".theory" - in - if not (List.mem valid !valid_candidates) then - valid_candidates := valid::!valid_candidates - end - end - | uri when (not looking_for_dir) && check_if_obj_RE uri -> - (* file hit *) - store_obj (Pcre.replace ~rex:til_slash_RE uri) - | uri -> ()); -(* - (* miss *) - if !index_not_generated_yet && - Pcre.pmatch ~rex:orig_theory_RE uri - then - (index_not_generated_yet := false ; - store_obj "index.theory")); - *) - if exists_theory regexp then store_obj "index.theory"; - List.iter - (fun localpart -> - if not (List.mem localpart !valid_candidates) then - store_obj localpart - ) !candidates_found ; - let ls_items = ref [] in - StringSet.iter (fun dir -> ls_items := Ls_section dir :: !ls_items) !dirs; - Http_getter_misc.hashtbl_sorted_iter - (fun uri (annflag, typesflag, bodyflag, treeflag) -> - ls_items := - Ls_object { - uri = uri; ann = annflag; - types = typesflag; body = bodyflag; proof_tree = treeflag - } :: !ls_items) - objs; - List.rev !ls_items - end +let ls regexp = + if remote () then + ls_remote regexp + else + let prefixes = explode_ls_regexp regexp in + merge_results (List.map dumb_ls prefixes) + +let getalluris () = + let rec aux acc = function + | [] -> acc + | dir :: todo -> + let acc', todo' = + List.fold_left + (fun (acc, subdirs) result -> + match result with + | Ls_object obj -> (dir ^ obj.uri) :: acc, subdirs + | Ls_section sect -> acc, (dir ^ sect ^ "/") :: subdirs) + (acc, todo) + (dumb_ls dir) + in + aux acc' todo' + in + aux [] ["cic:/"] (* trailing slash required *) (* Shorthands from now on *) let getxml' uri = getxml (UriManager.string_of_uri uri) let resolve' uri = resolve (UriManager.string_of_uri uri) -let register' uri url = register ~uri:(UriManager.string_of_uri uri) ~url -let unregister' uri = unregister (UriManager.string_of_uri uri) +let exists' uri = exists (UriManager.string_of_uri uri) -let sync_dump_file () = - if is_prefetch_on () then - dump_tree () - let init () = Http_getter_logger.set_log_level (Helm_registry.get_opt_default Helm_registry.int ~default:1 "getter.log_level"); Http_getter_logger.set_log_file - (Helm_registry.get_opt Helm_registry.string "getter.log_file"); - Http_getter_env.reload (); - let is_prefetch_set = - Helm_registry.get_opt_default Helm_registry.bool ~default:false - "getter.prefetch" - in - if is_prefetch_set then - (* ignore (Thread.create sync_with_map ()) *) - sync_with_map () + (Helm_registry.get_opt Helm_registry.string "getter.log_file") diff --git a/helm/ocaml/getter/http_getter.mli b/helm/ocaml/getter/http_getter.mli index 0b86e730d..3f56ae3da 100644 --- a/helm/ocaml/getter/http_getter.mli +++ b/helm/ocaml/getter/http_getter.mli @@ -42,42 +42,25 @@ val help: unit -> string * @raise Http_getter_types.Key_not_found _ *) val resolve: 
string -> string (* uri -> url *) - (** @raise Http_getter_types.Key_already_in _ *) -val register: uri:string -> url:string -> unit +val exists: string -> bool - (** does not fail if given uri does not exist *) -val unregister: string -> unit - -val update: ?logger:logger_callback -> unit -> unit - - (** @param format defaults to `Gzipped - * @param patch_dtd defaults to false *) -val getxml : ?format:encoding -> ?patch_dtd:bool -> string -> string +val getxml : string -> string val getxslt : ?patch_dtd:bool -> string -> string -val getdtd : ?patch_dtd:bool -> string -> string +val getdtd : string -> string val clean_cache: unit -> unit -val list_servers: unit -> (int * string) list -val add_server: ?logger:logger_callback -> ?position:int -> string -> unit -val remove_server: ?logger:logger_callback -> int -> unit val getalluris: unit -> string list -val getallrdfuris: [ `Forward | `Backward ] -> string list - (** @param regexp regular expression (PCRE syntax) over HELM URIs *) + (** @param baseuri uri to be listed, simple form or regular expressions (a + * single choice among parens) are permitted *) val ls: string -> ls_item list - (** {2 Shorthands} *) + (** {2 UriManager shorthands} *) -val getxml' : UriManager.uri -> string (* `Gzipped format, no DTD patch *) +val getxml' : UriManager.uri -> string val resolve' : UriManager.uri -> string -val register' : UriManager.uri -> string -> unit -val unregister' : UriManager.uri -> unit +val exists' : UriManager.uri -> bool (** {2 Misc} *) -val close_maps: unit -> unit -val update_from_one_server: ?logger:logger_callback -> string -> unit -val has_server: int -> bool (* does a server with a given position exists? *) val init: unit -> unit - (** cal this at exit() *) -val sync_dump_file: unit -> unit diff --git a/helm/ocaml/getter/http_getter_cache.ml b/helm/ocaml/getter/http_getter_cache.ml deleted file mode 100644 index e6627fa1e..000000000 --- a/helm/ocaml/getter/http_getter_cache.ml +++ /dev/null @@ -1,262 +0,0 @@ -(* - * Copyright (C) 2003-2004: - * Stefano Zacchiroli - * for the HELM Team http://helm.cs.unibo.it/ - * - * This file is part of HELM, an Hypertextual, Electronic - * Library of Mathematics, developed at the Computer Science - * Department, University of Bologna, Italy. - * - * HELM is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * HELM is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with HELM; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, - * MA 02111-1307, USA. - * - * For details, see the HELM World-Wide-Web page, - * http://helm.cs.unibo.it/ - *) - -(* TODO cache expires control!!! 
*) -(* TODO uwobo loop: - if two large proof (already in cache) are requested at the same time by two - clients, uwobo (java implementation, not yet tested with the OCaml one) - starts looping sending output to one of the client *) - -open Http_getter_common -open Http_getter_misc -open Http_getter_types -open Printf - - (* expose ThreadSafe.threadSafe methods *) -class threadSafe = - object - inherit ThreadSafe.threadSafe - method virtual doCritical : 'a. 'a lazy_t -> 'a - method virtual doReader : 'a. 'a lazy_t -> 'a - method virtual doWriter : 'a. 'a lazy_t -> 'a - end - -let threadSafe = new threadSafe - -let finally cleanup f = - try - let res = Lazy.force f in - cleanup (); - res - with e -> - cleanup (); - raise (Http_getter_types.Cache_failure (Printexc.to_string e)) - -let resource_type_of_url = function - | url when Pcre.pmatch ~pat:"\\.xml\\.gz$" url -> `Gzipped - | url when Pcre.pmatch ~pat:"\\.xml$" url -> `Normal - | url -> raise (Invalid_URL url) - -let extension_of_resource_type = function - | `Normal -> "xml" - | `Gzipped -> "xml.gz" - - (* basename = resource name without trailing ".gz", if any *) -let is_in_cache basename = - Sys.file_exists - (match Lazy.force Http_getter_env.cache_mode with - | `Normal -> basename - | `Gzipped -> basename ^ ".gz") - -let respond_xml - ?(via_http = true) ?(enc = `Normal) ?(patch = true) ~url ~uri outchan - = - let local_part = Http_getter_misc.local_url url in - let local_resource = local_part <> None in - let resource_type = resource_type_of_url url in - let extension = extension_of_resource_type resource_type in - let downloadname = - match uri_of_string uri with (* parse uri *) - | Cic_uri (Cic baseuri) | Cic_uri (Theory baseuri) -> - (* assumption: baseuri starts with "/" *) - sprintf "%s%s.%s" (Lazy.force Http_getter_env.cic_dir) baseuri extension - | Nuprl_uri baseuri -> - (* assumption: baseuri starts with "/" *) - sprintf "%s%s.%s" (Lazy.force Http_getter_env.nuprl_dir) baseuri - extension - | Rdf_uri (prefix, ((Cic baseuri) as qbaseuri)) - | Rdf_uri (prefix, ((Theory baseuri) as qbaseuri)) -> - let escaped_prefix = - (Pcre.replace ~pat:"/" ~templ:"_" - (Pcre.replace ~pat:"_" ~templ:"__" - (prefix ^ - (match qbaseuri with - | Cic _ -> "//cic:" - | Theory _ -> "//theory:")))) - in - sprintf "%s/%s%s.%s" - (Lazy.force Http_getter_env.rdf_dir) escaped_prefix baseuri extension - in - let patch_fun = - let xmlbases = - if Http_getter_common.is_theory_uri uri then - Some (Filename.dirname uri, Filename.dirname url) - else - None - in - if patch then - Some (Http_getter_common.patch_xml ?xmlbases ~via_http ()) - else - None - in - let basename = Pcre.replace ~pat:"\\.gz$" downloadname in - let contype = "text/xml" in - (* Fill cache if needed and return a short circuit file. - Short circuit is needed in situations like: - resource type = normal, cache type = gzipped, required encoding = normal - we would like to avoid normal -> gzipped -> normal conversions. 
To avoid - this tmp_short_circuit is used to remember the name of the intermediate - file name *) - let fill_cache () = - threadSafe#doWriter (lazy( - if local_resource then begin (* resource available via file system *) - Http_getter_logger.log ~level:2 "Local resource, avoid caching"; - None - end else if not (is_in_cache basename) then begin (* cache miss *) - Http_getter_logger.log ~level:2 "Cache MISS :-("; - mkdir ~parents:true (Filename.dirname downloadname); - match (resource_type, Lazy.force Http_getter_env.cache_mode) with - | `Normal, `Normal | `Gzipped, `Gzipped -> - wget ~output:downloadname url; - None - | `Normal, `Gzipped -> (* resource normal, cache gzipped *) - let tmp = tempfile () in - let (res, cleanup) = - if enc = `Normal then (* user wants normal: don't delete it! *) - (Some (tmp, enc), (fun () -> ())) - else - (None, (fun () -> Sys.remove tmp)) - in - finally cleanup (lazy ( - wget ~output:tmp url; - gzip ~output:(basename ^ ".gz") ~keep:true tmp; (* fill cache *) - res - )); - | `Gzipped, `Normal -> (* resource gzipped, cache normal *) - let tmp = tempfile () in - let (res, cleanup) = - if enc = `Gzipped then (* user wants .gz: don't delete it! *) - (Some (tmp, enc), (fun () -> ())) - else - (None, (fun () -> Sys.remove tmp)) - in - finally cleanup (lazy ( - wget ~output:tmp url; - gunzip ~output:basename ~keep:true tmp; (* fill cache *) - res - )); - end else begin (* cache hit *) - Http_getter_logger.log ~level:2 "Cache HIT :-)"; - None - end - )) in - let tmp_short_circuit = fill_cache () in - threadSafe#doReader (lazy( - assert (local_resource || is_in_cache basename); - let basename = match local_part with Some f -> f | None -> basename in - let resource_type = - if local_resource then - resource_type - else - Lazy.force Http_getter_env.cache_mode - in - match (enc, resource_type) with - | `Normal, `Normal | `Gzipped, `Gzipped -> - (* resource (in cache or local) is already in the required format *) - (match enc with - | `Normal -> - Http_getter_logger.log ~level:2 - "No format mangling required (encoding = normal)"; - return_file ~via_http ~fname:basename ~contype ?patch_fun ~enc - outchan - | `Gzipped -> - Http_getter_logger.log ~level:2 - "No format mangling required (encoding = gzipped)"; - return_file - ~via_http ~fname:(basename ^ ".gz") ~contype ~contenc:"x-gzip" - ?patch_fun ~gunzip:true ~enc - outchan) - | `Normal, `Gzipped | `Gzipped, `Normal -> - (match tmp_short_circuit with - | None -> (* no short circuit possible, use cache *) - Http_getter_logger.log ~level:2 - "No short circuit available, use cache (or local resource)"; - let tmp = tempfile () in - finally (fun () -> Sys.remove tmp) (lazy ( - (match enc with - | `Normal -> - (* required format normal, cached (or local) version gzipped *) - gunzip (* gunzip to tmp *) - ~output:tmp ~keep:true (basename ^ ".gz"); - return_file ~via_http ~fname:tmp ~contype ?patch_fun ~enc outchan - | `Gzipped -> - (* required format is gzipped, cached version is normal *) - gzip ~output:tmp ~keep:true basename; (* gzip to tmp *) - return_file - ~via_http ~fname:tmp ~contype ~contenc:"x-gzip" - ?patch_fun ~gunzip:true ~enc - outchan) - )) - | Some (fname, `Normal) -> (* short circuit available, use it! *) - Http_getter_logger.log ~level:2 - "Using short circuit (encoding = normal)"; - finally (fun () -> Sys.remove fname) (lazy ( - return_file ~via_http ~fname ~contype ?patch_fun ~enc outchan - )) - | Some (fname, `Gzipped) -> (* short circuit available, use it! 
*) - Http_getter_logger.log ~level:2 - "Using short circuit (encoding = gzipped)"; - finally (fun () -> Sys.remove fname) (lazy ( - return_file ~via_http ~fname ~contype ~contenc:"x-gzip" ?patch_fun - ~gunzip:true ~enc outchan - ))) - )) - - (* TODO enc is not yet supported *) -let respond_xsl - ?(via_http = true) ?(enc = `Normal) ?(patch = true) ~url outchan - = - let patch_fun = - if patch then Http_getter_common.patch_xsl ~via_http () else (fun x -> x) - in - let fname = tempfile () in - finally (fun () -> Sys.remove fname) (lazy ( - wget ~output:fname url; - return_file ~via_http ~fname ~contype:"text/xml" ~patch_fun ~enc outchan - )) - - (* TODO enc is not yet supported *) -let respond_dtd - ?(via_http = true) ?(enc = `Normal) ?(patch = true) ~url outchan - = - let patch_fun = - if patch then Http_getter_common.patch_dtd ~via_http () else (fun x -> x) - in - if Sys.file_exists url then - (* TODO check this: old getter here used text/xml *) - return_file ~via_http ~fname:url ~contype:"text/plain" ~patch_fun ~enc - outchan - else - raise (Dtd_not_found url) - -let clean () = - let module E = Http_getter_env in - List.iter - (function dir -> ignore (Unix.system ("rm -rf " ^ dir ^ "/*"))) - [ Lazy.force E.cic_dir; Lazy.force E.nuprl_dir; Lazy.force E.rdf_dir ] - diff --git a/helm/ocaml/getter/http_getter_cache.mli b/helm/ocaml/getter/http_getter_cache.mli deleted file mode 100644 index 7789a023b..000000000 --- a/helm/ocaml/getter/http_getter_cache.mli +++ /dev/null @@ -1,50 +0,0 @@ -(* - * Copyright (C) 2003-2004: - * Stefano Zacchiroli - * for the HELM Team http://helm.cs.unibo.it/ - * - * This file is part of HELM, an Hypertextual, Electronic - * Library of Mathematics, developed at the Computer Science - * Department, University of Bologna, Italy. - * - * HELM is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * HELM is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with HELM; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, - * MA 02111-1307, USA. - * - * For details, see the HELM World-Wide-Web page, - * http://helm.cs.unibo.it/ - *) - -open Http_getter_types;; - -(** all these methods could raise Http_getter_types.Cache_failure. 
- * @param via_http (default: true) @see Http_getter_common.return_file *) - -val respond_xml: - ?via_http:bool -> ?enc:encoding -> ?patch:bool -> url:string -> uri:string -> - out_channel -> - unit - -val respond_xsl: - ?via_http:bool -> ?enc:encoding -> ?patch:bool -> url:string -> - out_channel -> - unit - -val respond_dtd: - ?via_http:bool -> ?enc:encoding -> ?patch:bool -> url:string -> - out_channel -> - unit - -val clean: unit -> unit - diff --git a/helm/ocaml/getter/http_getter_common.ml b/helm/ocaml/getter/http_getter_common.ml index c85f680c2..d56cf6909 100644 --- a/helm/ocaml/getter/http_getter_common.ml +++ b/helm/ocaml/getter/http_getter_common.ml @@ -115,14 +115,14 @@ let return_file ~fname ?contype ?contenc ?patch_fun ?(gunzip = false) ?(via_http = true) ~enc outchan = - let headers = - match (contype, contenc) with - | (Some t, Some e) -> ["Content-Encoding", e; "Content-Type", t] - | (Some t, None) -> ["Content-Type" , t] - | (None, Some e) -> ["Content-Encoding", e] - | (None, None) -> [] - in if via_http then begin + let headers = + match (contype, contenc) with + | (Some t, Some e) -> ["Content-Encoding", e; "Content-Type", t] + | (Some t, None) -> ["Content-Type" , t] + | (None, Some e) -> ["Content-Encoding", e] + | (None, None) -> [] + in Http_daemon.send_basic_headers ~code:(`Code 200) outchan; Http_daemon.send_headers headers outchan; Http_daemon.send_CRLF outchan diff --git a/helm/ocaml/getter/http_getter_common.mli b/helm/ocaml/getter/http_getter_common.mli index e94e39314..d1bc66f76 100644 --- a/helm/ocaml/getter/http_getter_common.mli +++ b/helm/ocaml/getter/http_getter_common.mli @@ -32,6 +32,7 @@ val string_of_ls_flag: ls_flag -> string val string_of_encoding: encoding -> string val is_cic_uri: string -> bool +val is_cic_obj_uri: string -> bool val is_theory_uri: string -> bool val is_nuprl_uri: string -> bool val is_rdf_uri: string -> bool @@ -41,10 +42,10 @@ val uri_of_string: string -> uri (** @param xmlbases (xml base URI * xml base URL) *) val patch_xml : - ?via_http:bool -> ?xmlbases:(string * string) -> unit -> string -> string -val patch_dtd : ?via_http:bool -> unit -> string -> string + ?via_http:bool -> ?xmlbases:(string * string) -> unit -> (string -> string) +val patch_dtd : ?via_http:bool -> unit -> (string -> string) (* TODO via_http not yet supported for patch_xsl *) -val patch_xsl : ?via_http:bool -> unit -> string -> string +val patch_xsl : ?via_http:bool -> unit -> (string -> string) (** @param fname name of the file to be sent diff --git a/helm/ocaml/getter/http_getter_const.ml b/helm/ocaml/getter/http_getter_const.ml index f12d20f77..b9b0ce7a7 100644 --- a/helm/ocaml/getter/http_getter_const.ml +++ b/helm/ocaml/getter/http_getter_const.ml @@ -28,7 +28,7 @@ open Printf;; -let version = "0.3.0" +let version = "0.4.0" let conffile = "http_getter.conf.xml" let xhtml_ns = "http://www.w3.org/1999/xhtml" @@ -88,12 +88,6 @@ let usage_string configuration =

   clean_cache
-  getalluris?format=(txt|xml)
-  getallrdfuris
   ls?baseuri=regexp&format=(txt|xml)
diff --git a/helm/ocaml/getter/http_getter_env.ml b/helm/ocaml/getter/http_getter_env.ml index 1fa159a3c..a73710632 100644 --- a/helm/ocaml/getter/http_getter_env.ml +++ b/helm/ocaml/getter/http_getter_env.ml @@ -29,33 +29,15 @@ open Printf open Http_getter_types +open Http_getter_misc let version = Http_getter_const.version -let servers_file = lazy ( - Helm_registry.get_opt Helm_registry.string "getter.servers_file") -let cic_dbm = lazy (Helm_registry.get "getter.maps_dir" ^ "/cic_db") -let cic_dbm_real = lazy (Helm_registry.get "getter.maps_dir" ^ "/cic_db.pag") -let nuprl_dbm = lazy (Helm_registry.get "getter.maps_dir" ^ "/nuprl_db") -let rdf_dbm = lazy (Helm_registry.get "getter.maps_dir" ^ "/rdf_db") -let xsl_dbm = lazy (Helm_registry.get "getter.maps_dir" ^ "/xsl_db") -let dump_file = lazy (Helm_registry.get "getter.maps_dir" ^ - "/cic_db_tree.dump") -let prefetch = lazy (Helm_registry.get_bool "getter.prefetch") -let xml_index = lazy ( - Helm_registry.get_opt_default Helm_registry.string ~default:"index.txt" - "getter.xml_indexname") -let rdf_index = lazy ( - Helm_registry.get_opt_default Helm_registry.string ~default:"rdf_index.txt" - "getter.rdf_indexname") -let xsl_index = lazy ( - Helm_registry.get_opt_default Helm_registry.string ~default:"xslt_index.txt" - "getter.xsl_indexname") -let cic_dir = lazy (Helm_registry.get "getter.cache_dir" ^ "/cic") -let nuprl_dir = lazy (Helm_registry.get "getter.cache_dir" ^ "/nuprl") -let rdf_dir = lazy (Helm_registry.get "getter.cache_dir" ^ "/rdf") -let dtd_dir = lazy (Helm_registry.get "getter.dtd_dir") -let dtd_base_urls = lazy ( +let blanks_RE = Pcre.regexp "\\s" + +let cache_dir = lazy (normalize_dir (Helm_registry.get "getter.cache_dir")) +let dtd_dir = lazy (normalize_dir (Helm_registry.get "getter.dtd_dir")) +let dtd_base_urls = lazy ( let rex = Pcre.regexp "/*$" in let raw_urls = match @@ -68,40 +50,16 @@ let dtd_base_urls = lazy ( let port = lazy ( Helm_registry.get_opt_default Helm_registry.int ~default:58081 "getter.port") -let _servers = ref None - -let servers = - function () -> - (match !_servers with - | None -> failwith "Getter not yet initialized: servers not available" - | Some servers -> servers) - -let load_servers () = - let pos = ref ~-1 in - match Lazy.force servers_file with - | None -> - List.map (fun s -> incr pos; (!pos, s)) - (Helm_registry.get_list Helm_registry.string "getter.servers") - | Some servers_file -> - List.rev (Http_getter_misc.fold_file - (fun line servers -> - if Http_getter_misc.is_blank_line line then - servers - else - (incr pos; (!pos, line) :: servers)) - [] - servers_file) - -let reload_servers () = _servers := Some (load_servers ()) - -let save_servers () = - match Lazy.force servers_file with - | None -> () - | Some servers_file -> - let oc = open_out servers_file in - List.iter (fun (_,server) -> output_string oc (server ^ "\n")) - (servers ()); - close_out oc +let prefixes = lazy ( + let prefixes = Helm_registry.get_list Helm_registry.string "getter.prefix" in + List.fold_left + (fun acc prefix -> + match Pcre.split ~rex:blanks_RE prefix with + | [uri_prefix; url_prefix] -> (uri_prefix, url_prefix) :: acc + | _ -> + Http_getter_logger.log ("skipping invalid prefix: " ^ prefix); + acc) + [] prefixes) let host = lazy @@ -115,88 +73,29 @@ let my_own_url = sprintf "http://%s%s" (* without trailing '/' *) host (if port = 80 then "" else (sprintf ":%d" port))) -let cache_mode = - lazy - (let mode_string = - Helm_registry.get_opt_default Helm_registry.string ~default:"gz" - "getter.cache_mode" - 
in - match String.lowercase mode_string with - | "normal" -> `Normal - | "gz" -> `Gzipped - | mode -> failwith ("Invalid cache mode: " ^ mode)) - -let reload () = reload_servers () - let env_to_string () = + let pp_prefix (uri_prefix, url_prefix) = uri_prefix ^ " -> " ^ url_prefix in + let pp_prefixes prefixes = + match prefixes with + | [] -> "" + | l -> "\n" ^ String.concat "\n" (List.map pp_prefix l) ^ "\n" + in sprintf -"HTTP Getter %s (the OCaml one!) +"HTTP Getter %s -cic_dbm:\t%s -nuprl_dbm:\t%s -rdf_dbm:\t%s -xsl_dbm:\t%s -xml_index:\t%s -rdf_index:\t%s -xsl_index:\t%s -cic_dir:\t%s -nuprl_dir:\t%s -rdf_dir:\t%s +prefixes:%s dtd_dir:\t%s -dump_file:\t%s -prefetch:\t%b -servers_file:\t%s host:\t\t%s port:\t\t%d my_own_url:\t%s dtd_base_urls:\t%s -cache_mode:\t%s -servers: -\t%s log_file:\t%s log_level:\t%d " - version (Lazy.force cic_dbm) (Lazy.force nuprl_dbm) (Lazy.force rdf_dbm) - (Lazy.force xsl_dbm) (Lazy.force xml_index) - (Lazy.force rdf_index) (Lazy.force xsl_index) (Lazy.force cic_dir) - (Lazy.force nuprl_dir) (Lazy.force rdf_dir) - (Lazy.force dtd_dir) (Lazy.force dump_file) (Lazy.force prefetch) - (match Lazy.force servers_file with - | None -> "no servers file" - | Some servers_file -> servers_file) - (Lazy.force host) - (Lazy.force port) (Lazy.force my_own_url) - (String.concat " " (Lazy.force dtd_base_urls)) - (match Lazy.force cache_mode with - | `Normal -> "Normal" - | `Gzipped -> "GZipped") - (String.concat "\n\t" (* (position * server) list *) - (List.map (fun (pos, server) -> sprintf "%3d: %s" pos server) - (servers ()))) + version + (pp_prefixes (Lazy.force prefixes)) + (Lazy.force dtd_dir) (Lazy.force host) (Lazy.force port) + (Lazy.force my_own_url) (String.concat " " (Lazy.force dtd_base_urls)) (match Http_getter_logger.get_log_file () with None -> "None" | Some f -> f) (Http_getter_logger.get_log_level ()) -let add_server ?position url = - let new_servers = - let servers = servers () in - match position with - | None -> servers @ [-1, url]; - | Some p when p > 0 -> - let rec add_after pos = function - | [] -> [-1, url] - | hd :: tl when p = 1 -> hd :: (-1, url) :: tl - | hd :: tl (* when p > 1 *) -> hd :: (add_after (pos - 1) tl) - in - add_after p servers - | Some 0 -> (-1, url)::servers - | Some _ -> assert false - in - _servers := Some new_servers; - save_servers (); - reload_servers () - -let remove_server position = - _servers := Some (List.remove_assoc position (servers ())); - save_servers (); - reload_servers () - diff --git a/helm/ocaml/getter/http_getter_env.mli b/helm/ocaml/getter/http_getter_env.mli index 7d2d1632e..fa90d354a 100644 --- a/helm/ocaml/getter/http_getter_env.mli +++ b/helm/ocaml/getter/http_getter_env.mli @@ -1,77 +1,48 @@ -(* - * Copyright (C) 2003-2004: - * Stefano Zacchiroli - * for the HELM Team http://helm.cs.unibo.it/ +(* Copyright (C) 2005, HELM Team. + * + * This file is part of HELM, an Hypertextual, Electronic + * Library of Mathematics, developed at the Computer Science + * Department, University of Bologna, Italy. + * + * HELM is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * HELM is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
* - * This file is part of HELM, an Hypertextual, Electronic - * Library of Mathematics, developed at the Computer Science - * Department, University of Bologna, Italy. - * - * HELM is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * HELM is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with HELM; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, - * MA 02111-1307, USA. - * - * For details, see the HELM World-Wide-Web page, - * http://helm.cs.unibo.it/ + * You should have received a copy of the GNU General Public License + * along with HELM; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + * + * For details, see the HELM World-Wide-Web page, + * http://helm.cs.unibo.it/ *) -open Http_getter_types;; - - (* {2 general information} *) + (** {2 general information} *) val version : string (* getter version *) - (* {2 environment gathered data} *) + (** {2 environment gathered data} *) + (** all *_dir values are returned with trailing "/" *) -val cic_dbm : string lazy_t (* XML map DBM file for CIC *) -val cic_dbm_real : string lazy_t (* XML map DBM file for CIC ^ ".pag"*) -val nuprl_dbm : string lazy_t (* XML map DBM file for NuPRL *) -val rdf_dbm : string lazy_t (* RDF map DBM file *) -val xsl_dbm : string lazy_t (* XSL map DBM file *) -val dump_file : string lazy_t (* CIC DBM tree dump file *) -val prefetch : bool lazy_t (* prefetch URIs tree? 
*) -val xml_index : string lazy_t (* XMLs' index *) -val rdf_index : string lazy_t (* RDFs' index *) -val xsl_index : string lazy_t (* XSLTs' index *) -val cic_dir : string lazy_t (* XMLs' directory for CIC*) -val nuprl_dir : string lazy_t (* XMLs' directory for NuPRL*) -val rdf_dir : string lazy_t (* RDFs' directory *) +val cache_dir : string lazy_t (* cache root *) val dtd_dir : string lazy_t (* DTDs' root directory *) -val servers_file : string option lazy_t (* servers.txt file *) val port : int lazy_t (* port on which getter listens *) val dtd_base_urls : string list lazy_t (* base URLs for document patching *) +val prefixes : (string * string) list lazy_t (* prefix map uri -> url *) (* {2 derived data} *) val host : string lazy_t (* host on which getter listens *) val my_own_url : string lazy_t (* URL at which contact getter *) -val servers : unit -> (int * string) list - (* (position * server) list *) -val cache_mode : encoding lazy_t (* cached files encoding *) - - (* {2 dynamic configuration changes} *) - - (* add a server to servers list in a given position (defaults to "after the - last server", change servers file accordingly and reload servers list *) -val add_server: ?position:int -> string -> unit - (* remove a server from servers list, change servers file accordingly and - reload servers list *) -val remove_server: int -> unit (* {2 misc} *) -val reload: unit -> unit (* reload configuration information *) val env_to_string : unit -> string (* dump a textual representation of the current http_getter settings on an output channel *) diff --git a/helm/ocaml/getter/http_getter_md5.ml b/helm/ocaml/getter/http_getter_md5.ml deleted file mode 100644 index ea9116fde..000000000 --- a/helm/ocaml/getter/http_getter_md5.ml +++ /dev/null @@ -1,42 +0,0 @@ -(* md5 helpers *) -let md5_create files reg = - let opts = "md5sum " ^ String.concat " " files ^ " > " ^ reg in - let rc = Unix.system opts in - match rc with - | Unix.WEXITED 0 -> () - | Unix.WEXITED i -> raise (Failure "Unable to create the md5sums file") - | _ -> assert false - -let md5_check reg = - let opts = "md5sum -c " ^ reg in - let rc = Unix.system opts in - match rc with - | Unix.WEXITED 0 -> true - | Unix.WEXITED _ -> false - | _ -> assert false - -let reg () = (Lazy.force Http_getter_env.dump_file) ^ ".md5" - -(* sync checks *) - -(* maybe should be useda as a fallback *) -(* -let is_in_sync_date () = - let get_last_mod_date f = - try - (Unix.stat f).Unix.st_mtime - with - | Unix.Unix_error (Unix.ENOENT, _, _)-> 0.0 - in - let map_date = get_last_mod_date (Lazy.force Http_getter_env.cic_dbm_real) in - let dump_date = get_last_mod_date (Lazy.force Http_getter_env.dump_file) in - dump_date < map_date -*) - -let check_hash () = - md5_check (reg ()) - -let create_hash files = - md5_create files (reg ()) - - diff --git a/helm/ocaml/getter/http_getter_md5.mli b/helm/ocaml/getter/http_getter_md5.mli deleted file mode 100644 index c787843d4..000000000 --- a/helm/ocaml/getter/http_getter_md5.mli +++ /dev/null @@ -1,2 +0,0 @@ -val check_hash: unit -> bool -val create_hash: string list -> unit diff --git a/helm/ocaml/getter/http_getter_misc.ml b/helm/ocaml/getter/http_getter_misc.ml index 0430c1b48..85b87cb93 100644 --- a/helm/ocaml/getter/http_getter_misc.ml +++ b/helm/ocaml/getter/http_getter_misc.ml @@ -294,3 +294,14 @@ let extension s = String.sub s idx (String.length s - idx) with Not_found -> "" +let temp_file_of_uri uri = + let flat_string s s' c = + let cs = String.copy s in + for i = 0 to (String.length s) - 1 do + if 
String.contains s' s.[i] then cs.[i] <- c + done; + cs + in + let user = try Unix.getlogin () with _ -> "" in + Filename.open_temp_file (user ^ flat_string uri ".-=:;!?/&" '_') "" + diff --git a/helm/ocaml/getter/http_getter_misc.mli b/helm/ocaml/getter/http_getter_misc.mli index 0e0a3471f..e9086e8d1 100644 --- a/helm/ocaml/getter/http_getter_misc.mli +++ b/helm/ocaml/getter/http_getter_misc.mli @@ -95,3 +95,5 @@ val strip_suffix: suffix:string -> string -> string val extension: string -> string (** @return string part after rightmost "." *) +val temp_file_of_uri: string -> string * out_channel + diff --git a/helm/ocaml/getter/http_getter_storage.ml b/helm/ocaml/getter/http_getter_storage.ml new file mode 100644 index 000000000..8266c9611 --- /dev/null +++ b/helm/ocaml/getter/http_getter_storage.ml @@ -0,0 +1,221 @@ +(* Copyright (C) 2004-2005, HELM Team. + * + * This file is part of HELM, an Hypertextual, Electronic + * Library of Mathematics, developed at the Computer Science + * Department, University of Bologna, Italy. + * + * HELM is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * HELM is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with HELM; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, + * MA 02111-1307, USA. + * + * For details, see the HELM World-Wide-Web page, + * http://helm.cs.unibo.it/ + *) + +open Printf + +open Http_getter_misc +open Http_getter_types + +exception Not_found' +exception Resource_not_found of string (** uri *) + +let index_fname = "INDEX" + +let trailing_slash_RE = Pcre.regexp "/$" +let relative_RE_raw = "(^[^/]+(/[^/]+)*/?$)" +let relative_RE = Pcre.regexp relative_RE_raw +let file_scheme_RE_raw = "(^file://)" +let extended_file_scheme_RE = Pcre.regexp "(^file:/+)" +let file_scheme_RE = Pcre.regexp (relative_RE_raw ^ "|" ^ file_scheme_RE_raw) +let http_scheme_RE = Pcre.regexp "^http://" +let newline_RE = Pcre.regexp "\\n" +let cic_scheme_sep_RE = Pcre.regexp ":/" +let gz_suffix = ".gz" + +let path_of_file_url url = + assert (Pcre.pmatch ~rex:file_scheme_RE url); + if Pcre.pmatch ~rex:relative_RE url then + url + else (* absolute path, add heading "/" if missing *) + "/" ^ (Pcre.replace ~rex:extended_file_scheme_RE url) + + (** associative list regular expressions -> url prefixes + * sorted with longest prefixes first *) +let prefix_map = lazy ( + let map_w_length = + List.map + (fun (uri_prefix, url_prefix) -> + let uri_prefix = normalize_dir uri_prefix in + let url_prefix = normalize_dir url_prefix in + let regexp = Pcre.regexp ("^(" ^ Pcre.quote uri_prefix ^ ")") in + (regexp, String.length uri_prefix, uri_prefix, url_prefix)) + (Lazy.force Http_getter_env.prefixes) + in + let decreasing_length (_, len1, _, _) (_, len2, _, _) = compare len2 len1 in + List.map + (fun (regexp, len, uri_prefix, url_prefix) -> + (regexp, strip_trailing_slash uri_prefix, url_prefix)) + (List.fast_sort decreasing_length map_w_length)) + +let resolve_prefix uri = + let matches = + List.filter (fun (rex, _, _) -> Pcre.pmatch ~rex uri) + (Lazy.force prefix_map) + in + match 
matches with + | (rex, _, url_prefix) :: _ -> Pcre.replace_first ~rex ~templ:url_prefix uri + | [] -> raise (Unresolvable_URI uri) + +let exists_http _ url = + Http_getter_wget.exists (url ^ gz_suffix) || Http_getter_wget.exists url + +let exists_file _ fname = + Sys.file_exists (fname ^ gz_suffix) || Sys.file_exists fname + +let resolve_http _ url = + try + List.find Http_getter_wget.exists [ url ^ gz_suffix; url ] + with Not_found -> raise Not_found' + +let resolve_file _ fname = + try + List.find Sys.file_exists [ fname ^ gz_suffix; fname ] + with Not_found -> raise Not_found' + +let strip_gz_suffix fname = + if extension fname = ".gz" then + String.sub fname 0 (String.length fname - 3) + else + fname + +let remove_duplicates l = + Http_getter_misc.list_uniq (List.fast_sort Pervasives.compare l) + +let ls_file_single _ path_prefix = + let is_dir fname = (Unix.stat fname).Unix.st_kind = Unix.S_DIR in + let is_useless dir = try dir.[0] = '.' with _ -> false in + let entries = ref [] in + try + let dir_handle = Unix.opendir path_prefix in + (try + while true do + let entry = Unix.readdir dir_handle in + if is_useless entry then + () + else if is_dir (path_prefix ^ "/" ^ entry) then + entries := normalize_dir entry :: !entries + else + entries := strip_gz_suffix entry :: !entries + done + with End_of_file -> Unix.closedir dir_handle); + remove_duplicates !entries + with Unix.Unix_error (_, "opendir", _) -> [] + +let ls_http_single _ url_prefix = + let index = Http_getter_wget.get (normalize_dir url_prefix ^ index_fname) in + Pcre.split ~rex:newline_RE index + +let get_file _ path = + if Sys.file_exists (path ^ gz_suffix) then + path ^ gz_suffix + else if Sys.file_exists path then + path + else + raise Not_found' + +let get_http uri url = + let scheme, path = + match Pcre.split ~rex:cic_scheme_sep_RE uri with + | [scheme; path] -> scheme, path + | _ -> assert false + in + let cache_dest = + sprintf "%s%s/%s" (Lazy.force Http_getter_env.cache_dir) scheme path + in + if not (Sys.file_exists cache_dest) then begin (* fill cache *) + Http_getter_misc.mkdir ~parents:true (Filename.dirname cache_dest); + (try + Http_getter_wget.get_and_save (url ^ gz_suffix) (cache_dest ^ gz_suffix) + with Http_user_agent.Http_error _ -> + (try + Http_getter_wget.get_and_save url cache_dest + with Http_user_agent.Http_error _ -> + raise Not_found')) + end; + cache_dest + +let remove_file _ path = + if Sys.file_exists (path ^ gz_suffix) then Sys.remove (path ^ gz_suffix); + if Sys.file_exists path then Sys.remove path + +let remove_http _ _ = + prerr_endline "Http_getter_storage.remove: not implemented for HTTP scheme"; + assert false + +type 'a storage_method = { + name: string; + file: string -> string -> 'a; (* unresolved uri, resolved uri *) + http: string -> string -> 'a; (* unresolved uri, resolved uri *) +} + +let dispatch storage_method uri = + assert (extension uri <> ".gz"); + let uri = (* add trailing slash to roots *) + try + if uri.[String.length uri - 1] = ':' then uri ^ "/" + else uri + with Invalid_argument _ -> uri + in + let url = resolve_prefix uri in + try + if Pcre.pmatch ~rex:file_scheme_RE url then + storage_method.file uri (path_of_file_url url) + else if Pcre.pmatch ~rex:http_scheme_RE url then + storage_method.http uri url + else + raise (Unsupported_scheme url) + with Not_found' -> raise (Resource_not_found uri) + +let exists = + dispatch { name = "exists"; file = exists_file; http = exists_http } +let resolve = + dispatch { name = "resolve"; file = resolve_file; http = resolve_http } 
+let ls_single =
+  dispatch { name = "ls"; file = ls_file_single; http = ls_http_single }
+let get = dispatch { name = "get"; file = get_file; http = get_http }
+let remove =
+  dispatch { name = "remove"; file = remove_file; http = remove_http }
+
+let filename = get
+
+  (* ls_single performs ls only below a single prefix, but prefixes that share
+   * a common initial segment with the given one must be considered as well:
+   * for example, when doing "ls cic:/" we would like to see the "cic:/matita"
+   * directory *)
+let ls uri_prefix =
+  let direct_results = ls_single uri_prefix in
+  List.fold_left
+    (fun results (_, uri_prefix', _) ->
+      if Filename.dirname uri_prefix' = strip_trailing_slash uri_prefix then
+        (Filename.basename uri_prefix' ^ "/") :: results
+      else
+        results)
+    direct_results
+    (Lazy.force prefix_map)
+
+let clean_cache () =
+  ignore (Sys.command
+    (sprintf "rm -rf %s/" (Lazy.force Http_getter_env.cache_dir)))
+
diff --git a/helm/ocaml/getter/http_getter_storage.mli b/helm/ocaml/getter/http_getter_storage.mli
new file mode 100644
index 000000000..396510af8
--- /dev/null
+++ b/helm/ocaml/getter/http_getter_storage.mli
@@ -0,0 +1,62 @@
+(* Copyright (C) 2004-2005, HELM Team.
+ *
+ * This file is part of HELM, an Hypertextual, Electronic
+ * Library of Mathematics, developed at the Computer Science
+ * Department, University of Bologna, Italy.
+ *
+ * HELM is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * HELM is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with HELM; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ *
+ * For details, see the HELM World-Wide-Web page,
+ * http://helm.cs.unibo.it/
+ *)
+
+(** Transparent handling of local/remote getter resources.
+ * The configuration of this module consists of prefix mappings (see
+ * Http_getter_env.prefixes). All functions of this module take a URI as input,
+ * resolve it using the mappings and act on the resulting resource, which can be
+ * local (file:/// scheme or relative path) or remote via HTTP (http:// scheme).
+ *
+ * Each resource can be either compressed (trailing ".gz") or non-compressed.
+ * All functions of this module first look for the compressed resource
+ * (i.e. the requested one ^ ".gz"), falling back to the non-compressed one.
+ *
+ * All filenames returned by functions of this module exist on the filesystem
+ * after the function returns.
+ *
+ * Almost all functions may raise Resource_not_found; the following invariant
+ * holds: that exception is raised iff exists returns false on the given resource
+ * *)
+
+exception Resource_not_found of string (** uri *)
+
+  (** @return a list of strings; directories are returned with a trailing "/" *)
+val ls: string -> string list
+
+
+  (** @return the filename of the resource corresponding to a given uri. Handles
+   * download and caching for remote resources.
+   *)
+val filename: string -> string
+
+  (** Only works for local resources.
+   * If both compressed and non-compressed versions of a resource exist, both
+   * of them are removed *)
+val remove: string -> unit
+
+val exists: string -> bool
+val resolve: string -> string
+
+val clean_cache: unit -> unit
+
diff --git a/helm/ocaml/getter/http_getter_types.ml b/helm/ocaml/getter/http_getter_types.ml
index 1574e2bdb..172550652 100644
--- a/helm/ocaml/getter/http_getter_types.ml
+++ b/helm/ocaml/getter/http_getter_types.ml
@@ -37,6 +37,7 @@ exception Dtd_not_found of string (* dtd's url *)
 exception Key_already_in of string;;
 exception Key_not_found of string;;
 exception Http_client_error of string * string (* url, error message *)
+exception Unsupported_scheme of string (** unsupported url scheme *)
 
 type encoding = [ `Normal | `Gzipped ]
 type answer_format = [ `Text | `Xml ]
diff --git a/helm/ocaml/getter/http_getter_wget.ml b/helm/ocaml/getter/http_getter_wget.ml
index 99a9f01e7..3171e664f 100644
--- a/helm/ocaml/getter/http_getter_wget.ml
+++ b/helm/ocaml/getter/http_getter_wget.ml
@@ -30,28 +30,34 @@ let send cmd =
     ignore (Http_user_agent.get cmd)
   with e -> raise (Http_client_error (cmd, Printexc.to_string e))
 
-let get uri =
+let get url =
   try
-    Http_user_agent.get uri
-  with e -> raise (Http_client_error (Printexc.to_string e, uri))
+    Http_user_agent.get url
+  with e -> raise (Http_client_error (Printexc.to_string e, url))
 
-let get_and_save uri dest_filename =
+let get_and_save url dest_filename =
   let out_channel = open_out dest_filename in
-  Http_user_agent.get_iter (output_string out_channel) uri;
+  Http_user_agent.get_iter (output_string out_channel) url;
   close_out out_channel
 
-let get_and_save_to_tmp uri =
+let get_and_save_to_tmp url =
   let flat_string s s' c =
     let cs = String.copy s in
     for i = 0 to (String.length s) - 1 do
      if String.contains s' s.[i] then cs.[i] <- c
-    done ;
+    done;
     cs
   in
   let user = try Unix.getlogin () with _ -> "" in
   let tmp_file =
-    Filename.temp_file (user ^ flat_string uri ".-=:;!?/&" '_') ""
+    Filename.temp_file (user ^ flat_string url ".-=:;!?/&" '_') ""
   in
-  get_and_save uri tmp_file ;
+  get_and_save url tmp_file;
   tmp_file
 
+let exists url =
+  try
+    ignore (Http_user_agent.head url);
+    true
+  with Http_user_agent.Http_error _ -> false
+
diff --git a/helm/ocaml/getter/http_getter_wget.mli b/helm/ocaml/getter/http_getter_wget.mli
index 4c978314e..5d28df185 100644
--- a/helm/ocaml/getter/http_getter_wget.mli
+++ b/helm/ocaml/getter/http_getter_wget.mli
@@ -23,9 +23,13 @@
  * http://cs.unibo.it/helm/.
  *)
 
-val send: string -> unit
+  (** try to guess whether an HTTP resource exists using a HEAD request
+   * @return true if the HEAD response code is 200 *)
+val exists: string -> bool
 val get: string -> string
 val get_and_save: string -> string -> unit
 val get_and_save_to_tmp: string -> string
+val send: string -> unit
+
diff --git a/helm/ocaml/getter/mkindexes.pl b/helm/ocaml/getter/mkindexes.pl
new file mode 100755
index 000000000..3107846aa
--- /dev/null
+++ b/helm/ocaml/getter/mkindexes.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl -w
+# To be invoked in a directory where a tree of XML files of the HELM library is
+# rooted. This script will then create INDEX files in all directories of the
+# tree.
+use strict;
+my $index_fname = "INDEX";
+sub getcwd() {
+  my $pwd = `pwd`;
+  chomp $pwd;
+  return $pwd;
+}
+sub add_trailing_slash($) {
+  my ($dir) = @_;
+  return $dir if ($dir =~ /\/$/);
+  return "$dir/";
+}
+sub indexable($) {
+  my ($fname) = @_;
+  return 1 if ($fname =~ /\.(ind|types|body|var|theory)\.xml/);
+  return 0;
+}
+my @todo = (getcwd());
+while (my $dir = shift @todo) {
+  print "$dir\n";
+  chdir $dir or die "Can't chdir to $dir\n";
+  open LS, 'ls | sed \'s/\\.gz//\' | sort | uniq |';
+  open INDEX, "> $index_fname"
+    or die "Can't open $index_fname in " . getcwd() . "\n";
+  while (my $entry = <LS>) {
+    chomp $entry;
+    if (-d $entry) {
+      print INDEX add_trailing_slash($entry) . "\n";
+      push @todo, getcwd() . "/$entry";
+    } else {
+      print INDEX "$entry\n" if indexable($entry);
+    }
+  }
+  close INDEX;
+  close LS;
+}
diff --git a/helm/ocaml/getter/sample.conf.xml b/helm/ocaml/getter/sample.conf.xml
new file mode 100644
index 000000000..52f383fec
--- /dev/null
+++ b/helm/ocaml/getter/sample.conf.xml
@@ -0,0 +1,37 @@
+
+ luser +
+
+ /projects/helm/var/servers.txt
+ true
+ /tmp/helm/cache
+ /projects/helm/var
+ /projects/helm/xml/dtd
+ 58081
+ 180
+ /projects/helm/daemons/log/http_getter.log
+
+ cic:/
+ file:///projects/helm/library/coq_contribs/
+
+
+ cic:/matita/
+ file:///home/zacchiro/helm/matita/.matita/xml/matita/
+
+
+ theory:/
+ file:///projects/helm/library/theories/
+
+
+
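The three mappings listed in sample.conf.xml above are what the prefix_map/resolve_prefix machinery of http_getter_storage.ml matches against, longest URI prefix first, so that anything below cic:/matita/ is served from the matita tree while the rest of cic:/ falls back to coq_contribs. The sketch below (not part of the patch) shows only that longest-prefix-wins rule under simplifying assumptions: it uses plain String functions instead of the Pcre regexps of the real code, the helper names (starts_with, resolve) and the example URI are invented, and failure is signalled with failwith rather than Unresolvable_URI.

(* Illustrative sketch: longest-prefix resolution over the mappings above *)
let prefixes = [
  ("cic:/",        "file:///projects/helm/library/coq_contribs/");
  ("cic:/matita/", "file:///home/zacchiro/helm/matita/.matita/xml/matita/");
  ("theory:/",     "file:///projects/helm/library/theories/");
]

let starts_with ~prefix s =
  String.length s >= String.length prefix
  && String.sub s 0 (String.length prefix) = prefix

let resolve uri =
  (* keep only the matching prefixes, then prefer the longest one, so that a
   * URI below cic:/matita/ is rewritten with the matita mapping, not cic:/ *)
  let matching = List.filter (fun (p, _) -> starts_with ~prefix:p uri) prefixes in
  let longest_first (p1, _) (p2, _) =
    compare (String.length p2) (String.length p1) in
  match List.sort longest_first matching with
  | (p, url) :: _ ->
      url ^ String.sub uri (String.length p) (String.length uri - String.length p)
  | [] -> failwith ("unresolvable URI: " ^ uri)  (* real code raises Unresolvable_URI *)

let () =
  (* prints file:///home/zacchiro/helm/matita/.matita/xml/matita/nat/plus.con.xml *)
  print_endline (resolve "cic:/matita/nat/plus.con.xml")

Any other cic:/ URI matches only the first mapping and is therefore rewritten with the coq_contribs prefix.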
diff --git a/helm/ocaml/getter/test.ml b/helm/ocaml/getter/test.ml index ab5e3812b..a35ed4d5b 100644 --- a/helm/ocaml/getter/test.ml +++ b/helm/ocaml/getter/test.ml @@ -1,14 +1,11 @@ let _ = Helm_registry.load_from "foo.conf.xml" -let fname = - Http_getter.getxml ~format:`Normal ~patch_dtd:true Sys.argv.(1) -in +let fname = Http_getter.getxml ~format:`Normal ~patch_dtd:true Sys.argv.(1) in let ic = open_in fname in (try while true do let line = input_line ic in print_endline line done -with End_of_file -> ()); -Sys.remove fname +with End_of_file -> ()) diff --git a/helm/ocaml/getter/tree.ml b/helm/ocaml/getter/tree.ml deleted file mode 100644 index 6b2fc6fa6..000000000 --- a/helm/ocaml/getter/tree.ml +++ /dev/null @@ -1,76 +0,0 @@ -(* to avoid the need of -rectypes *) -type tree = Foo of (string * tree) list - -let rec add_path t l = - match l with - | [] -> t (* no more path to add *) - | name::tl -> add_path_aux t name tl - -and add_path_aux t name tl = - match t with - | Foo [] -> Foo [(name, add_path (Foo []) tl)] - | Foo ((n, t')::bro) when n = name -> Foo ((n, (add_path t' tl))::bro) - | Foo (((n, t') as x)::bro) -> - let tmp = add_path_aux (Foo bro) name tl in - match tmp with Foo y -> Foo (x::y) - -let rec get_frontier t l = - match l with - | [] -> (match t with Foo bla -> - List.map (function (x,Foo []) -> x | (x,_) -> (x^"/")) bla) - | name::tl -> get_frontier_aux t name tl - -and get_frontier_aux (t:tree) name tl = - match t with - | Foo [] -> [] - | Foo ((n, t')::bro) when Pcre.pmatch ~pat:("^" ^ name ^ "$") n -> - (* since regex are no more "unique" matches, we have to continue - * searching on the brothers. - *) - get_frontier t' tl @ get_frontier_aux (Foo bro) name tl - | Foo (_::bro) -> get_frontier_aux (Foo bro) name tl - -let rec remove_path t path = - match path with - | [] -> t - | name::tl -> remove_path_aux t name tl - -and remove_path_aux t name tl = - match t with - | Foo [] -> assert false - | Foo ((n, t')::bro) when n = name -> - let tmp = remove_path t' tl in - (match tmp with - | Foo [] -> Foo bro - | Foo x -> Foo ((n, Foo x)::bro)) - | Foo (((n, t') as x)::bro) -> - let tmp = remove_path_aux (Foo bro) name tl in - match tmp with Foo y -> Foo (x::y) - -let split_RE = Pcre.regexp "/" - -let empty_tree = Foo [] - -let add_uri suri t = - let s = (Pcre.split ~rex:split_RE suri) in - add_path t s - -let ls_path path t = - let s = (Pcre.split ~rex:split_RE path) in - get_frontier t s - -let remove_uri suri t = - let s = (Pcre.split ~rex:split_RE suri) in - remove_path t s - -let save_to_disk path t = - let o = open_out path in - Marshal.to_channel o t []; - close_out o - -let load_from_disk path = - let i = open_in path in - let t = Marshal.from_channel i in - close_in i; - t - diff --git a/helm/ocaml/getter/tree.mli b/helm/ocaml/getter/tree.mli deleted file mode 100644 index 76118a142..000000000 --- a/helm/ocaml/getter/tree.mli +++ /dev/null @@ -1,12 +0,0 @@ -type tree - -val empty_tree: tree - -val add_uri: string -> tree -> tree -val remove_uri: string -> tree -> tree - -val ls_path: string -> tree -> string list - -val save_to_disk: string -> tree -> unit -val load_from_disk: string -> tree - -- 2.39.2
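For readers trying out the new interface, here is a minimal client sketch in the spirit of test.ml above. Only Helm_registry.load_from and the values declared in http_getter_storage.mli are taken from the patch; the configuration file name reuses sample.conf.xml and the URIs are placeholders, so the listed resources are assumptions, not guaranteed to exist in the mapped trees.

(* Minimal client sketch of the new Http_getter_storage interface *)
let _ = Helm_registry.load_from "sample.conf.xml"  (* load prefix mappings, as test.ml does *)

let () =
  let prefix = "cic:/matita/" in                   (* placeholder prefix *)
  (* ls lists the entries below a prefix; directories carry a trailing "/" *)
  List.iter print_endline (Http_getter_storage.ls prefix);
  let resource = prefix ^ "nat/nat.ind.xml" in     (* hypothetical resource *)
  if Http_getter_storage.exists resource then begin
    (* filename resolves the URI, downloads and caches it when the mapping is
     * remote, and returns a local file name, preferring the ".gz" variant *)
    let fname = Http_getter_storage.filename resource in
    Printf.printf "%s -> %s\n" resource fname
  end else
    prerr_endline ("not found: " ^ resource)

Because ls marks sub-directories with a trailing "/", a client can walk the library without extra stat calls; for HTTP-mapped prefixes this is what the INDEX files produced by mkindexes.pl make possible.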