1 #!/usr/bin/ocamlrun /usr/bin/ocaml
2 (* Copyright (C) 2006, HELM Team.
4 * This file is part of HELM, an Hypertextual, Electronic
5 * Library of Mathematics, developed at the Computer Science
6 * Department, University of Bologna, Italy.
8 * HELM is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
13 * HELM is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with HELM; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
23 * For details, see the HELM World-Wide-Web page,
24 * http://helm.cs.unibo.it/
29 with Invalid_argument _ ->
30 prerr_endline "Usage: split.ml <FILE.html>";
35 #require "pxp-engine";;
36 #require "pxp-lex-utf8";;
40 let xhtml_header title =
42 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>
43 <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">
44 <html xmlns=\"http://www.w3.org/1999/xhtml\">
58 ('a Pxp_document.node #Pxp_document.extension as 'a) Pxp_document.node
61 Pxp_reader.lookup_public_id_as_file [
62 "-//W3C//DTD XHTML 1.0 Transitional//EN", "xhtml1-transitional.dtd";
63 "-//W3C//ENTITIES Latin 1 for XHTML//EN", "xhtml-lat1.ent";
64 "-//W3C//ENTITIES Symbols for XHTML//EN", "xhtml-symbol.ent";
65 "-//W3C//ENTITIES Special for XHTML//EN", "xhtml-special.ent"; ]
68 Pxp_tree_parser.parse_wfdocument_entity
69 { Pxp_types.default_config with Pxp_types.encoding = `Enc_utf8 }
70 (Pxp_types.from_file ~alt:[ resolver ] fname)
71 Pxp_tree_parser.default_spec
73 (** pattern matching like predicate on pxp nodes *)
74 let match_elt tag attr_name ?attr_value () node =
75 node#node_type = Pxp_document.T_element tag
76 && (match attr_value with
78 (try node#attribute attr_name = Pxp_types.Value attr_value
79 with Not_found -> false)
80 | None -> List.mem attr_name node#attribute_names)
85 Pxp_document.find ~deeply:true
86 (match_elt "div" "class" ~attr_value:"book" ()) doc#root
87 with Not_found -> failwith "Can't find book <div>" in
89 Pxp_document.find ~deeply:false
90 (match_elt "div" "class" ~attr_value:"titlepage" ()) document in
92 Pxp_document.find ~deeply:false
93 (match_elt "div" "class" ~attr_value:"toc" ()) document in
95 Pxp_document.find_all ~deeply:false
96 (match_elt "div" "class" ~attr_value:"chapter" ()) document in
98 Pxp_document.find ~deeply:true
99 (fun node -> node#node_type = Pxp_document.T_element "title") doc#root in
101 titlepage :: toc :: parts
103 let localize_ids parts =
104 let id2part = Hashtbl.create 1023 in
105 let part_ids = ref [] in
108 match Pxp_document.find_all ~deeply:true (match_elt "a" "id" ()) part with
110 let part_id = part_id#required_string_attribute "id" in
111 part_ids := part_id :: !part_ids;
114 let id = id#required_string_attribute "id" in
115 Hashtbl.add id2part id part_id)
117 | _ -> failwith "can't find part id")
121 let fname_of_part part_name = part_name ^ ".html"
123 let get_part_id part =
126 Pxp_document.find ~deeply:true (match_elt "a" "id" ()) part
127 with Not_found -> failwith "can't find part id" in
128 id#required_string_attribute "id"
130 let get_part_title part =
132 Pxp_document.find ~deeply:true
133 (match_elt "h2" "class" ~attr_value:"title" ()) part in
135 List.find (fun node -> node#node_type = Pxp_document.T_data) h2#sub_nodes in
138 let iter_xrefs f node =
139 let a_s = Pxp_document.find_all ~deeply:true (match_elt "a" "href" ()) node in
141 (fun (node: 'a node) ->
142 let href = node#required_string_attribute "href" in
143 assert (String.length href > 0);
144 if href.[0] = '#' then
145 let xref = String.sub href 1 (String.length href - 1) in
149 let patch_toc part_ids id2part toc =
152 if List.mem xref part_ids then
153 node#set_attribute "href" (Pxp_types.Value (fname_of_part xref))
156 let part = Hashtbl.find id2part xref in
157 node#set_attribute "href"
158 (Pxp_types.Value (fname_of_part part ^ "#" ^ xref))
159 with Not_found -> ())
162 let patch_part part_ids id2part part =
163 let part_name = get_part_id part in
167 let xref_part = Hashtbl.find id2part xref in
168 if xref_part <> part_name then
169 node#set_attribute "href"
170 (Pxp_types.Value (fname_of_part xref_part ^ "#" ^ xref))
171 with Not_found -> ())
174 let open_formatter fname =
175 Unix.open_process_out (sprintf "xmllint --format -o %s -" fname)
177 let close_formatter oc = ignore (Unix.close_process_out oc)
179 let output_index title (titlepage: 'a node) (toc: 'a node) fname =
180 let oc = open_formatter fname in
181 output_string oc (xhtml_header title);
182 titlepage#write (`Out_channel oc) `Enc_utf8;
183 toc#write (`Out_channel oc) `Enc_utf8;
184 output_string oc xhtml_trailer;
187 let output_part title (part: 'a node) fname =
188 let oc = open_formatter fname in
190 (xhtml_header (sprintf "%s - %s" title (get_part_title part)));
191 part#write (`Out_channel oc) `Enc_utf8;
192 output_string oc xhtml_trailer;
196 let doc = parse_xml fname in
198 | title, (titlepage :: toc :: parts) ->
199 let part_ids, id2part = localize_ids parts in
200 patch_toc part_ids id2part toc;
201 List.iter (patch_part part_ids id2part) parts;
202 output_index title titlepage toc "index.html";
204 (fun part -> output_part title part (get_part_id part ^ ".html"))
206 | _ -> failwith "Unrecognized document structure"