(* * Copyright (C) 2003-2004: * Stefano Zacchiroli * for the HELM Team http://helm.cs.unibo.it/ * * This file is part of HELM, an Hypertextual, Electronic * Library of Mathematics, developed at the Computer Science * Department, University of Bologna, Italy. * * HELM is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * HELM is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HELM; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, * MA 02111-1307, USA. * * For details, see the HELM World-Wide-Web page, * http://helm.cs.unibo.it/ *) open Printf open Http_getter_debugger let trailing_dot_gz_RE = Pcre.regexp "\\.gz$" (* for g{,un}zip *) let url_RE = Pcre.regexp "^([\\w.-]+)(:(\\d+))?(/.*)?$" let http_scheme_RE = Pcre.regexp ~flags:[`CASELESS] "^http://" let file_scheme_RE = Pcre.regexp ~flags:[`CASELESS] "^file://" let dir_sep_RE = Pcre.regexp "/" let heading_slash_RE = Pcre.regexp "^/" let bufsiz = 16384 (* for file system I/O *) let tcp_bufsiz = 4096 (* for TCP I/O *) let fold_file f init fname = let ic = open_in fname in let rec aux acc = let line = try Some (input_line ic) with End_of_file -> None in match line with | None -> acc | Some line -> aux (f line acc) in let res = try aux init with e -> close_in ic; raise e in close_in ic; res let iter_file f = fold_file (fun line _ -> f line) () let hashtbl_sorted_fold f tbl init = let sorted_keys = List.sort compare (Hashtbl.fold (fun key _ keys -> key::keys) tbl []) in List.fold_left (fun acc k -> f k (Hashtbl.find tbl k) acc) init sorted_keys let hashtbl_sorted_iter f tbl = let sorted_keys = List.sort compare (Hashtbl.fold (fun key _ keys -> key::keys) tbl []) in List.iter (fun k -> f k (Hashtbl.find tbl k)) sorted_keys let cp src dst = let (ic, oc) = (open_in src, open_out dst) in let buf = String.create bufsiz in (try while true do let bytes = input ic buf 0 bufsiz in if bytes = 0 then raise End_of_file else output oc buf 0 bytes done with End_of_file -> ()); close_in ic; close_out oc let parse_url url = try let subs = Pcre.extract ~rex:url_RE (Pcre.replace ~rex:http_scheme_RE url) in (subs.(1), (if subs.(2) = "" then 80 else int_of_string subs.(3)), (if subs.(4) = "" then "/" else subs.(4))) with exc -> failwith (sprintf "Can't parse url: %s (exception: %s)" url (Printexc.to_string exc)) let init_socket addr port = let inet_addr = (Unix.gethostbyname addr).Unix.h_addr_list.(0) in let sockaddr = Unix.ADDR_INET (inet_addr, port) in let suck = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in Unix.connect suck sockaddr; let outchan = Unix.out_channel_of_descr suck in let inchan = Unix.in_channel_of_descr suck in (inchan, outchan) let http_get_iter_buf ~callback url = let (address, port, path) = parse_url url in let buf = String.create tcp_bufsiz in let (inchan, outchan) = init_socket address port in output_string outchan (sprintf "GET %s\r\n" path); flush outchan; (try while true do match input inchan buf 0 tcp_bufsiz with | 0 -> raise End_of_file | bytes when bytes = tcp_bufsiz -> (* buffer full, no need to slice it *) callback buf | bytes when bytes < tcp_bufsiz -> (* buffer not full, slice it *) callback (String.sub buf 0 bytes) | _ -> (* ( bytes < 0 ) || ( bytes > tcp_bufsiz ) *) assert false done with End_of_file -> ()); close_in inchan (* close also outchan, same fd *) let wget ?output url = debug_print (sprintf "wgetting %s (output: %s)" url (match output with None -> "default" | Some f -> f)); match url with | url when Pcre.pmatch ~rex:file_scheme_RE url -> (* file:// *) (let src_fname = Pcre.replace ~rex:file_scheme_RE url in match output with | Some dst_fname -> cp src_fname dst_fname | None -> let dst_fname = Filename.basename src_fname in if src_fname <> dst_fname then cp src_fname dst_fname else (* src and dst are the same: do nothing *) ()) | url when Pcre.pmatch ~rex:http_scheme_RE url -> (* http:// *) (let oc = open_out (match output with Some f -> f | None -> Filename.basename url) in http_get_iter_buf ~callback:(fun data -> output_string oc data) url; close_out oc) | scheme -> (* unsupported scheme *) failwith ("Http_getter_misc.wget: unsupported scheme: " ^ scheme) let gzip ?(keep = false) ?output fname = let output = match output with None -> fname ^ ".gz" | Some fname -> fname in debug_print (sprintf "gzipping %s (keep: %b, output: %s)" fname keep output); let (ic, oc) = (open_in fname, Gzip.open_out output) in let buf = String.create bufsiz in (try while true do let bytes = input ic buf 0 bufsiz in if bytes = 0 then raise End_of_file else Gzip.output oc buf 0 bytes done with End_of_file -> ()); close_in ic; Gzip.close_out oc; if not keep then Sys.remove fname ;; let gunzip ?(keep = false) ?output fname = (* assumption: given file name ends with ".gz" or output is set *) let output = match output with | None -> if (Pcre.pmatch ~rex:trailing_dot_gz_RE fname) then Pcre.replace ~rex:trailing_dot_gz_RE fname else failwith "Http_getter_misc.gunzip: unable to determine output file name" | Some fname -> fname in debug_print (sprintf "gunzipping %s (keep: %b, output: %s)" fname keep output); let (ic, oc) = (Gzip.open_in fname, open_out output) in let buf = String.create bufsiz in (try while true do let bytes = Gzip.input ic buf 0 bufsiz in if bytes = 0 then raise End_of_file else Pervasives.output oc buf 0 bytes done with End_of_file -> ()); Gzip.close_in ic; close_out oc; if not keep then Sys.remove fname ;; let tempfile () = Filename.temp_file "http_getter_" "" exception Mkdir_failure of string * string;; (* dirname, failure reason *) let dir_perm = 0o755 let mkdir ?(parents = false) dirname = let mkdirhier () = let (pieces, hd) = let split = Pcre.split ~rex:dir_sep_RE dirname in if Pcre.pmatch ~rex:heading_slash_RE dirname then (List.tl split, "/") else (split, "") in ignore (List.fold_left (fun pre dir -> let next_dir = sprintf "%s%s%s" pre (match pre with "/" | "" -> "" | _ -> "/") dir in (try (match (Unix.stat next_dir).Unix.st_kind with | Unix.S_DIR -> () (* dir component already exists, go on! *) | _ -> (* dir component already exists but isn't a dir, abort! *) raise (Mkdir_failure (dirname, sprintf "'%s' already exists but is not a dir" next_dir))) with Unix.Unix_error (Unix.ENOENT, "stat", _) -> (* dir component doesn't exists, create it and go on! *) Unix.mkdir next_dir dir_perm); next_dir) hd pieces) in if parents then mkdirhier () else Unix.mkdir dirname dir_perm let string_of_proc_status = function | Unix.WEXITED code -> sprintf "[Exited: %d]" code | Unix.WSIGNALED sg -> sprintf "[Killed: %d]" sg | Unix.WSTOPPED sg -> sprintf "[Stopped: %d]" sg let http_get url = if Pcre.pmatch ~rex:file_scheme_RE url then begin (* file:// URL. Read data from file system *) let fname = Pcre.replace ~rex:file_scheme_RE url in try let size = (Unix.stat fname).Unix.st_size in let buf = String.create size in let ic = open_in fname in really_input ic buf 0 size; close_in ic; Some buf with Unix.Unix_error (Unix.ENOENT, "stat", _) -> None end else (* other URL, pass it to Http_client *) try Some (Http_client.http_get url) with e -> prerr_endline (sprintf "Warning: Http_client failed on url %s with exception: %s" url (Printexc.to_string e)); None let is_blank_line = let blank_line_RE = Pcre.regexp "(^#)|(^\\s*$)" in fun line -> Pcre.pmatch ~rex:blank_line_RE line