From c59f43de7aaacf1901c68dcfbcac6fe759759a29 Mon Sep 17 00:00:00 2001 From: Claudio Sacerdoti Coen Date: Wed, 27 Jun 2001 13:11:05 +0000 Subject: [PATCH] Repository created. --- helm/metadata/create/Makefile | 73 +++ helm/metadata/create/fix_rdf.pl | 26 + helm/metadata/create/mkindex.sh | 4 + helm/metadata/create/mywget.pl | 707 ++++++++++++++++++++++ helm/metadata/create/split.pl | 26 + helm/metadata/create/uris_of_filenames.pl | 14 + 6 files changed, 850 insertions(+) create mode 100644 helm/metadata/create/Makefile create mode 100755 helm/metadata/create/fix_rdf.pl create mode 100755 helm/metadata/create/mkindex.sh create mode 100755 helm/metadata/create/mywget.pl create mode 100755 helm/metadata/create/split.pl create mode 100755 helm/metadata/create/uris_of_filenames.pl diff --git a/helm/metadata/create/Makefile b/helm/metadata/create/Makefile new file mode 100644 index 000000000..ac242589d --- /dev/null +++ b/helm/metadata/create/Makefile @@ -0,0 +1,73 @@ +marcello_THEORIES_OK = \ + cic:/Coq/Arith/ \ + cic:/Coq/Bool/ \ + cic:/Coq/Init/ \ +\ + cic:/Coq/IntMap/Adalloc/ \ + cic:/Coq/IntMap/Addec/ \ + cic:/Coq/IntMap/Addr/ \ + cic:/Coq/IntMap/Adist/ \ + cic:/Coq/IntMap/Fset/ \ + cic:/Coq/IntMap/Lsort/ \ + cic:/Coq/IntMap/Map/ \ + cic:/Coq/IntMap/Mapaxioms/ \ + cic:/Coq/IntMap/Mapc/ \ + cic:/Coq/IntMap/Mapcanon/ \ + cic:/Coq/IntMap/Mapcard/ \ + cic:/Coq/IntMap/Mapfold/ \ + cic:/Coq/IntMap/Mapiter/ \ + cic:/Coq/IntMap/Maplists/ \ + cic:/Coq/IntMap/Mapsubset/ \ +\ + cic:/Coq/Lists/ \ + cic:/Coq/Logic/ \ +\ + cic:/Coq/Reals/R_Ifp/ \ + cic:/Coq/Reals/Raxioms/ \ + cic:/Coq/Reals/Rbase/ \ + cic:/Coq/Reals/Rbasic_fun/ \ + cic:/Coq/Reals/Rdefinitions/ \ + cic:/Coq/Reals/Rderiv/ \ + cic:/Coq/Reals/Rfunctions/ \ + cic:/Coq/Reals/Rlimit/ \ + cic:/Coq/Reals/Rseries/ \ + cic:/Coq/Reals/Rsyntax/ \ + cic:/Coq/Reals/Rtrigo_fun/ \ + cic:/Coq/Reals/TypeSyntax/ \ +\ + cic:/Coq/Relations/ \ + cic:/Coq/Sets/ \ + cic:/Coq/Wellfounded/ \ +\ + cic:/Coq/ZArith/Wf_Z/ \ + cic:/Coq/ZArith/ZArith_dec/ \ + cic:/Coq/ZArith/Zmisc/ \ + cic:/Coq/ZArith/Zsyntax/ \ + cic:/Coq/ZArith/auxiliary/ \ + cic:/Coq/ZArith/fast_integer/ \ + cic:/Coq/ZArith/zarith_aux/ \ +\ + cic:/Coq/correctness/ \ + cic:/Coq/field/ \ + cic:/Coq/fourier/ \ +\ + cic:/Coq/omega/Zcomplements/ \ + cic:/Coq/omega/Zlogarithm/ \ + cic:/Coq/omega/Zpower/ \ +\ + cic:/Coq/ring/ \ + +marcello: + time ./mywget.pl $($@_THEORIES_OK) + +rdf: + ./split.pl output/* + find rdf -type f -exec ./fix_rdf.pl {} \; + (cd rdf ; ../mkindex.sh) + +.PHONY: clean rdf clean-rdf +clean: + rm -f output/* + +clean-rdf: + rm -rf rdf/* diff --git a/helm/metadata/create/fix_rdf.pl b/helm/metadata/create/fix_rdf.pl new file mode 100755 index 000000000..070136bc0 --- /dev/null +++ b/helm/metadata/create/fix_rdf.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl + +$filename = $uri = $ARGV[0]; +$outputfile = $filename.".xml"; +$uri =~ s/^rdf/cic:/; +$uri =~ s/(.*),([^,]*),([^,]*)/$1#xpointer(1\/$2\/$3)/; +$uri =~ s/(.*),([^,]*)/$1#xpointer(1\/$2)/; + +print "Now processing file $filename\n"; + +open(HEADER,">>$outputfile"); +print HEADER < + +EOT +close(HEADER); + +system("cat $filename >> $outputfile"); + +open(FOOTER,">>$outputfile"); +print FOOTER < +EOT +close(FOOTER); + +unlink $filename; diff --git a/helm/metadata/create/mkindex.sh b/helm/metadata/create/mkindex.sh new file mode 100755 index 000000000..8c52a9276 --- /dev/null +++ b/helm/metadata/create/mkindex.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +echo `find . -name "*.xml"` | ../uris_of_filenames.pl > rdf_index.txt +echo `find . -name "*.xml.gz"` | ../uris_of_filenames.pl -gz >> rdf_index.txt diff --git a/helm/metadata/create/mywget.pl b/helm/metadata/create/mywget.pl new file mode 100755 index 000000000..5b3c8d611 --- /dev/null +++ b/helm/metadata/create/mywget.pl @@ -0,0 +1,707 @@ +#!/usr/bin/perl + +# Copyright (C) 2000, HELM Team. +# +# This file is part of HELM, an Hypertextual, Electronic +# Library of Mathematics, developed at the Computer Science +# Department, University of Bologna, Italy. +# +# HELM is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# HELM is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with HELM; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# For details, see the HELM World-Wide-Web page, +# http://cs.unibo.it/helm/. + +#use strict; + +use HTTP::Daemon; +use HTTP::Status; +use HTTP::Request; +use LWP::UserAgent; +use DB_File; +use Compress::Zlib; +use URI::Escape; + +foreach $i (@ARGV) { + my $filename = $i; + $filename =~ s/\//./g; + open(FD,">output/$filename"); + +# my $url = "http://phd.cs.unibo.it:8080/helm/servlet/uwobo/apply?keys=pfi%2Creorder&xmluri=http%3A//phd.cs.unibo.it%3A8081/ls%3Fformat%3Dxml%26baseuri%3D$i¶m.uri=$i¶m.getterURL=http://phd.cs.unibo.it:8081/"; + +my $url = "http://dotto.cs.unibo.it:8080/helm/servlet/uwobo/apply?keys=pfi%2Creorder&xmluri=http%3A//dotto.cs.unibo.it%3A8081/ls%3Fformat%3Dxml%26baseuri%3D$i¶m.uri=$i¶m.getterURL=http://dotto.cs.unibo.it:8081/"; + + + print "Now processing $i...\n"; + print "$url\n"; + + my $time = time(); + my $ua = LWP::UserAgent->new; + my $request = HTTP::Request->new(GET => "$url"); + my $response = $ua->request($request, \&callback2); + $time = time() - $time; + print "Finished. Time elapsed: $time\n\n"; + + close(FD); +} + +exit; + +sub callback2 +{ + my ($data) = @_; + print FD $data; +} + +################# + + +#CSC: mancano i controlli sulle condizioni di errore di molte funzioni +#CSC: ==> non e' robusto +#CSC: altra roba da sistemare segnata con CSC + +my $d = new HTTP::Daemon LocalPort => 8081; +my $myownurl = $d->url; + +# Let's patch the returned URL +$myownurl =~ s/\/$//; # chop the final slash +my $myownport = $myownurl; +$myownport =~ s/http:\/\/(.*):(.*)/$2/; +$myownurl =~ s/http:\/\/(.*):(.*)/$1/; +($myownurl) = gethostbyname($myownurl); +$myownurl = "http://".$myownurl.":".$myownport; + +tie(%map, 'DB_File', $uris_dbm.".db", O_RDWR, 0664); +print "Please contact me at: \n"; +print "helm_dir: $helm_dir\n"; +print "style_dir: $style_dir\n"; +print "dtd_dir: $dtd_dir\n"; +print "urls_of_uris.db: $uris_dbm.db\n"; +print "cache mode: $cachemode\n"; + +$SIG{CHLD} = "IGNORE"; # do not accumulate defunct processes +$SIG{USR1} = \&update; # sent by the child to make the parent update +while (my $c = $d->accept) { + if (fork() == 0) { + while (my $r = $c->get_request) { + #CSC: mancano i controlli di sicurezza + + my $http_method = $r->method; + my $http_path = $r->url->path; + my $http_query = uri_unescape($r->url->query); + my $cgi = new CGI("$http_query"); + my $inputuri = $cgi->param('uri'); + print "\nRequest: ".$r->url."\n\n"; + + print "\nUnescaped query: ".$http_query."\n"; + + if ($http_method eq 'GET' and $http_path eq "/getxml") { + # finds the uri, url and filename + my $cicuri = $inputuri; + my $answerformat = $cgi->param('format'); + my $patch_dtd = $cgi->param('patch_dtd'); + $answerformat = "" if (not defined($answerformat)); + $patch_dtd = "yes" if (not defined($patch_dtd)); + if (($answerformat ne "gz") and ($answerformat ne "normal") + and ($answerformat ne "")) { + die "Wrong output format: $answerformat, must be 'normal' ". + "or 'gz'\n"; + } + if (($patch_dtd ne "yes") and ($patch_dtd ne "no") + and ($patch_dtd ne "")) { + die "Wrong param, patch_dtd must be 'yes' or 'no'\n"; + } + + my $cicfilename = $cicuri; + $cicfilename =~ s/cic:(.*)/$1/; + $cicfilename =~ s/theory:(.*)/$1/; + + my $cicurl = $map{$cicuri}; + if (not defined($cicurl)) { + die "uri \"$cicuri\" can't be resolved\n"; + } + my $extension; + if ($cicurl =~ /\.xml$/) { # non gzipped file + $extension = ".xml"; + } elsif ($cicurl =~ /\.xml\.gz$/) { # gzipped file + $extension = ".xml.gz"; + } else { # error: unknown extension + die "unexpected extension in url: $cicurl, might be '.xml'". + "or '.xml.gz'"; + } + $cicfilename = $helm_dir.$cicfilename.$extension; + + if (!defined($cicurl)) { + print "\nNOT FOUND!!!!!\n"; + $c->send_error(RC_NOT_FOUND) + } else { + print_request("cic",$cicuri,$cicurl,$cicfilename); + + # Retrieves the file + my $ciccontent = download($patch_dtd,"cic",$cicurl,$cicfilename,$answerformat); + + # Answering the client + if ($answerformat eq "normal") { + answer($c,$ciccontent,"text/xml",""); + } else { + answer($c,$ciccontent,"text/xml","x-gzip"); + } + } + } elsif ($http_method eq 'GET' and $http_path eq "/register") { + my $inputurl = $cgi->param('url'); + print "Register requested...\n"; + $map{$inputuri}=$inputurl; + + # Now let's clean the cache + my $cicfilename = $inputuri; + $cicfilename =~ s/cic:(.*)/$1/; + $cicfilename =~ s/theory:(.*)/$1/; + + print "Unlinking ".$helm_dir.$cicfilename.".xml[.gz]\n"; + unlink ($helm_dir.$cicfilename.".xml"); + unlink ($helm_dir.$cicfilename.".xml.gz"); + + kill(USR1,getppid()); # signal changes to parent + untie %map; + print "done\n"; + html_nice_answer($c,"Register done"); + } elsif ($http_method eq 'GET' and $http_path eq "/resolve") { + my $outputurl = $map{$inputuri}; + $outputurl = "" if (not defined($outputurl)); + $cont = "\n\n"; + if ($outputurl eq "") { + $cont .= "\n"; + } else { + $cont .= "\n"; + } + answer($c,$cont,"text/xml",""); + } elsif ($http_method eq 'GET' and $http_path eq "/getdtd") { + my $filename = $inputuri; + $filename = $dtd_dir."/".$filename; + print "DTD: $inputuri ==> ($filename)\n"; + if (stat($filename)) { + print "Using local copy\n"; + open(FD, $filename) or die "Cannot open $filename\n"; + $cont = ""; + while() { + # Vary bad heuristic for substituion of absolute URLs + # for relative ones + s/ENTITY (.*) SYSTEM\s+"/ENTITY $1 SYSTEM "$myownurl\/getdtd?uri=/g; + $cont .= $_; + } + close(FD); + answer($c,$cont,"text/xml",""); + } else { + die "Could not find DTD!"; + } + } elsif ($http_method eq 'GET' and $http_path eq "/getstyleconf") { + my $filename = $inputuri; + $filename = $style_dir."/config/".$filename; + if (stat($filename)) { + open(FD, $filename) or die "Cannot open $filename\n"; + $cont = ""; + while() { + s/DOCTYPE (.*) SYSTEM\s+"/DOCTYPE $1 SYSTEM "$myownurl\/getstyleconf?uri=/g; + $cont .= $_; + } + close(FD); + answer($c,$cont,"text/plain",""); + } else { + die "Could not find Style Configuration File!"; + } + } elsif ($http_method eq 'GET' and $http_path eq "/getxslt") { + my $filename = $inputuri; + $filename = $style_dir."/".$filename; + print "XSLT: $inputuri ==> ($filename)\n"; + if (stat($filename)) { + print "Using local copy\n"; + open(FD, $filename) or die "Cannot open $filename\n"; + $cont = ""; + while() { + # Vary bad heuristic for substituion of absolute URLs + # for relative ones + s/xsl:import\s+href="/xsl:import href="$myownurl\/getxslt?uri=/g ; + s/xsl:include\s+href="/xsl:include href="$myownurl\/getxslt?uri=/g ; + $cont .= $_; + } + close(FD); + answer($c,$cont,"text/xml",""); + } else { + die "Could not find XSLT!"; + } + } elsif ($http_method eq 'GET' and $http_path eq "/update") { + # rebuild urls_of_uris.db + print "Update requested...\n"; + mk_urls_of_uris(); + kill(USR1,getppid()); # signal changes to parent + print " done\n"; + html_nice_answer($c,"Update done"); + } elsif ($http_method eq 'GET' and $http_path eq "/ls") { + # send back keys that begin with a given uri + my ($uritype,$uripattern,$uriprefix); + my $baseuri = $cgi->param('baseuri'); + chop $baseuri if ($baseuri =~ /.*\/$/); # remove trailing "/" + my $outype = $cgi->param('format'); # output type, might be 'txt' or 'xml' + $uripattern = $baseuri; + $uripattern =~ s/^.*:(.*)/$1/; + if ($baseuri =~ /^cic:/) { + $uritype = "cic"; + } elsif ($baseuri =~ /^theory:/) { + $uritype = "theory"; + } else { + $uritype = "invalid"; + } + if ($uritype ne "invalid") { # uri is valid + if (($outype ne 'txt') and ($outype ne 'xml')) { # invalid out type + print "Invalid output type specified: $outype\n"; + html_nice_answer($c,"Invalid output type, must be ". + "'txt' or 'xml'"); + } else { # valid output + print "BASEURI $baseuri, FORMAT $outype\n"; + $cont = finduris($uritype,$uripattern,$outype); + if ($outype eq 'txt') { + answer($c,$cont,"text/plain",""); + } elsif ($outype eq 'xml') { + answer($c,$cont,"text/xml",""); + } else { + die "Internal error, exiting!"; + } + } + } else { # invalid uri + print "Invalid uri: $baseuri, may begin with 'cic:', ". + "'theory:' or '*:'\n"; + html_nice_answer($c,"Invalid uri , must begin with ". + "'cic:' or 'theory:'"); + } + } elsif ($http_method eq 'GET' and $http_path eq "/help") { + print "Help requested!\n"; + html_nice_answer($c,"HTTP Getter Version: $VERSION"); + } elsif ($http_method eq 'GET' and $http_path =~ /\.cgi$/) { + print "CGI requested!\n"; + if ($http_path !~ /^\/[^\/]*\.cgi$/) { + html_nice_answer($c,"Invalid CGI name: $http_path, ". + "you can't request CGI that contain a slash in the name\n"); + } elsif (stat "$cgi_dir"."$http_path") { + if (not -x "$cgi_dir/$http_path") { + html_nice_answer($c,"CGI $http_path found but not ". + "executable\n"); + } else { # exec CGI and anser back its output + my %old_ENV = %ENV; + %ENV = (); + $ENV{'QUERY_STRING'} = $http_query; + my $cgi_output = `$cgi_dir/$http_path`; + answer($c,$cgi_output,"",""); + %ENV = %old_ENV; + } + } else { + html_nice_answer($c,"CGI '$http_path' not found ". + "in CGI dir '$cgi_dir'"); + } + } else { + print "\n"; + print "INVALID REQUEST!!!!!\n"; + print "(PATH: ",$http_path,", "; + print "QUERY: ",$http_query,")\n"; + $c->send_error(RC_FORBIDDEN) + } + print "\nRequest solved: ".$r->url."\n\n"; + } + $c->close; + undef($c); + print "\nCONNECTION CLOSED\n\n"; + exit; + } # fork +} + +#================================ + +sub finduris { # find uris for cic and theory trees generation + my ($uritype,$uripattern,$format) = @_; + my $content = ""; + my ($uri,$localpart,$basepart,$dirname,$suffix,$flags,$key); + my (@itemz,@already_pushed_dir); + my (%objects,%dirs); # map uris to suffixes' flags + #my $debug=1; # for debug + + print "FINDURIS, uritype: $uritype, uripattern: $uripattern, ". + "format: $format\n\n" if defined($debug); + + if (($uritype eq "cic") or ($uritype eq "theory")) { + # get info only of one type: cic or theory + foreach (keys(%map)) { # select matching uris + $uri = $_; + if ($uri =~ /^$uritype:$uripattern(\/|$|\.)/) { + if ($uri =~ /^$uritype:$uripattern\//) { # directory match + $localpart = $uri; + $localpart =~ s/^$uritype:$uripattern\/(.*)/$1/; + } elsif ($uri =~ /^$uritype:$uripattern($|\.)/) { # file match + $localpart = $uri; + $localpart =~ s/^.*\/([^\/]*)/$1/; + } else { + die "Internal error, seems that requested match is none of ". + "directory match or file match"; + } + print "LOCALPART: $localpart\n" if defined($debug); + + if ($localpart =~ /^[^\/]*$/) { # no slash, an OBJECT + $basepart = $localpart; + $basepart =~ s/^([^.]*\.[^.]*)(\.types)?(\.ann)?/$1/; + # remove exts .types or + # .types.ann + $flags = $objects{$basepart}; # get old flags + if ($localpart =~ /\.ann$/) { + $flags = add_flag("ann","YES",$flags); + } else { + $flags = add_flag("ann","NO",$flags); + } + if ($localpart =~ /\.types$/) { + $flags = add_flag("types","YES",$flags); + } elsif ($localpart =~ /\.types\.ann$/) { + $flags = add_flag("types","ANN",$flags); + } else { + $flags = add_flag("types","NO",$flags); + } + $objects{$basepart} = $flags; # save new flags + } else { # exists at least one slash, a DIR + ($dirname) = split (/\//, $localpart); + $dirs{$dirname} = ""; # no flags requirement for dir + } + } + } + } else { + die "getter internal error: unsupported uritype: \"$uritype\""; + } + # now generate OUTPUT: + # output will be saved in $content + if ($format eq "txt") { # TXT output + foreach $key (sort(keys %dirs)) { + $content .= "dir, " . $key . "\n"; + } + foreach $key (sort(keys %objects)) { + $content .= "object, $key, " . $objects{$key} . "\n"; + } + } elsif ($format eq "xml") { # XML output + $content .= '' . "\n"; + $content .= "" . "\n\n"; + $content .= "\n"; + foreach $key (sort(keys %dirs)) { + $content .= "\t
$key
\n"; + } + foreach $key (sort(keys %objects)) { + $content .= "\t\n"; + $flags = $objects{$key}; + $flags =~ s/^<(.*)>$/$1/; + my ($annflag,$typesflag) = split /,/,$flags; + $content .= "\t\t\n"; + $content .= "\t\t\n"; + $content .= "\t\n"; + } + $content .= "
\n"; + } else { # may not enter this branch + die "Getter internal error: invalid format received by finduris sub"; + } + return $content; +} + +sub add_flag { +# manage string like: "" +# "ann_flag" may be one of "ann_YES", "ann_NO" +# "type_flag" may be one of "types_NO", "types_YES", "types_ANN" +# when adding a flag the max between the current flag and the new flag +# is taken, the orders are ann_NO < ann_YES and types_NO < types_YES < +# types_ANN + my ($flagtype,$newflag,$str) = @_; + $str = "<,>" if ($str eq ""); + ($str =~ s/^<(.*,.*)>$/$1/) or die "Internal error: ". + "wrong string format for flag adding in $str"; + my ($annflag,$typeflag) = split /,/,$str; + if ($flagtype eq "ann") { # ANN flag handling + if ($newflag eq "YES") { + $annflag = "YES"; + } elsif ($newflag eq "NO") { + $annflag = "NO" unless ($annflag eq "YES"); + } else { + die "Internal error: annflag must be \"YES\" or \"NO\""; + } + } elsif ($flagtype eq "types") { # TYPES flag handling + if ($newflag eq "ANN") { + $typeflag = "ANN"; + } elsif ($newflag eq "YES") { + $typeflag = "YES" unless ($typeflag eq "ANN"); + } elsif ($newflag eq "NO") { + $typeflag = "NO" + unless (($typeflag eq "ANN") or ($typeflag eq "YES")); + } else { + die "Internal error: typeflag must be \"YES\", \"NO\" or \"ANN\""; + } + } else { + die "Internal error: unsupported flagtype \"$flagtype\""; + } + $str = "<$annflag,$typeflag>"; +} + +#CSC: Too much powerful: creates even /home, /home/users/, ... +#CSC: Does not raise errors if could not create dirs/files +sub mkdirs +{ + my ($pathname) = @_; + my @dirs = split /\//,$pathname; + my $tmp; + foreach $dir (@dirs) { + $tmp = ((defined($tmp)) ? $tmp."\/".$dir : ""); + mkdir($tmp,0777); + } + rmdir($tmp); +} + +sub print_request +{ + my ($str,$uri,$url,$filename) = @_; + print $str."uri: $uri\n"; + print $str."url: $url\n"; + print $str."filename: $filename\n\n"; +} + +sub callback +{ + my ($data) = @_; + $cont .= $data; +} + +sub gunzip { # gunzip a file and return the deflated content + my ($filename) = @_; + + my ($gz, $buffer, $cont); + + print "deflating $filename ...\n"; + $gz = gzopen($filename, "r") + or die "Cannot open gzip'ed file $filename: $gzerrno"; + $cont = ""; + while ( $gz->gzread($buffer) > 0 ) { + $cont .= $buffer; + } + die "Error while reading : $gzerrno\n" if $gzerrno != Z_STREAM_END ; + $gz->gzclose(); + + return $cont; +} + +sub gzip { # gzip the content argument and save it to filename argument + my ($cont, $filename) = @_; + #my $debug=1; # debug only + + print "gzopening $filename ...\n" if (defined($debug)); + my $gz = gzopen($filename, "w") + or die "Cannot gzopen for writing file $filename: $gzerrno"; + print "gzwriting ...\n" if (defined($debug)); + $gz->gzwrite($cont) or die "error writing: $gzerrno, exiting!\n"; + print "gzclosing ...\n" if (defined($debug)); + $gz->gzclose(); +} + +sub download { + my ($patch_dtd,$str,$url,$filename,$format) = @_; + my ($gz, $buffer); + + #my $debug=1; # for DEBUG only + + my $resourcetype; # retrieve mode: "normal" (.xml) or "gzipped" (.xml.gz) + if ($filename =~ /\.xml$/) { # set retrieve mode + $resourcetype = "normal"; + } elsif ($filename =~ /\.xml\.gz$/) { + $resourcetype = "gzipped"; + } else { + die "Unsupported download extension, might be '.gz' or '.xml'\n"; + } + my $basefname = $filename; + $basefname =~ s/\.gz$//; # get base resource name removing trailing .gz + $cont = ""; # modified by side-effect by the callback function + + my $localfname=""; + if (stat($basefname)) { + $localfname=$basefname; + } elsif (stat($basefname.".gz")) { + $localfname=$basefname.".gz"; + } + if ($localfname ne "") { # we already have local copy of requested file + # check both possible cache entry: gzipped or normal + print "Using local copy for the $str file\n"; + if ($localfname =~ /\.xml\.gz$/) { # deflating cached file and return it + $cont = gunzip($localfname); + } elsif ($localfname =~ /\.xml$/) { # just return cached file + open(FD, $localfname) or die "Cannot open $localfname"; + while() { $cont .= $_; } + close(FD); + } else { # error + die "Internal error: unexpected file name $localfname," + ."must end with '.gz' or '.xml.gz'\n"; + } + } else { # download file from net + print "Downloading the $str file\n"; # download file + my $ua = LWP::UserAgent->new; + my $request = HTTP::Request->new(GET => "$url"); + my $response = $ua->request($request, \&callback); + + # cache retrieved file to disk + # TODO: inefficent, I haven't yet undestood how to deflate + # in memory gzipped file, without call "gzopen" + print "Storing the $str file\n"; + print "Making dirs ...\n" if (defined($debug)); + mkdirs($filename); + print "Opening tmp file for writing ...\n" if (defined($debug)); + open(FD, ">".$filename.".tmp") or die "Cannot open $filename.tmp\n"; + print "Writing on tmp file ...\n" if (defined($debug)); + print FD $cont; + print "Closing tmp file ...\n" if (defined($debug)); + close(FD); + + # handle cache conversion normal->gzipped or gzipped->normal as user choice + print "cachemode:$cachemode, resourcetype:$resourcetype\n" if (defined($debug)); + if (($cachemode eq 'normal') and ($resourcetype eq 'normal')) { + # cache the file as is + rename "$filename.tmp", $filename; + } elsif (($cachemode eq 'gzipped') and ($resourcetype eq 'gzipped')) { + # cache the file as is + # and update the $cont variabile with deflated content + rename "$filename.tmp", $filename; + $cont = gunzip($filename); + } elsif (($cachemode eq 'normal') and ($resourcetype eq 'gzipped')) { + # deflate cache entry + # and update $cont + open(FD, "> $basefname") or die "cannot open $basefname\n"; + $cont = gunzip($filename.".tmp"); + print FD $cont; + close(FD); + unlink "$filename.tmp"; # delete old gzipped file + } elsif (($cachemode eq 'gzipped') and ($resourcetype eq 'normal')) { + # compress cache entry + print "gzipping ...\n" if (defined($debug)); + gzip($cont, $basefname.".gz"); + unlink "$filename.tmp"; # delete old uncompressed file + } else { + die "Internal error, unsopported cachemode, resourcetype couple\n"; + } + # $cont now contained uncompressed data + } + if ($patch_dtd eq "yes") { + $cont =~ s/DOCTYPE (.*) SYSTEM\s+"http:\/\/www.cs.unibo.it\/helm\/dtd\//DOCTYPE $1 SYSTEM "$myownurl\/getdtd?uri=/g; + } + if ($format eq "gz") { + gzip($cont,"$basefname.tmp"); + open (TMP, "< $basefname.tmp") + or die "Can't open tempfile: $filename.tmp, exiting!\n"; + $cont = ""; + while() { + $cont .= $_; + } + close TMP; + unlink ($basefname . ".tmp") or + die "cannot unlink temporary file: $basefname.tmp\n"; + } + + return $cont; +} + +sub answer { +# answer the client setting content, Content-Type and Content-Enconding +# of the answer + my ($c,$cont,$contype,$contenc) = @_; + my $res = new HTTP::Response; + $res->content($cont); + $res->push_header("Content-Type" => $contype) + unless ($contype eq ""); + $res->push_header("Content-Encoding" => $contenc) + unless ($contenc eq ""); + $res->push_header("Cache-Control" => "no-cache"); + $res->push_header("Pragma" => "no-cache"); + $res->push_header("Expires" => "0"); + $c->send_response($res); +} + +sub html_nice_answer { +# answer the client whith a nice html document + my ($c,$content) = @_; + $content = "

$content

"; + answer($c,$content,"text/html",""); +} + +sub helm_wget { +#retrieve a file from an url and write it to a temp dir +#used for retrieve resource index from servers + $cont = ""; + my ($prefix, $URL) = @_; + my $ua = LWP::UserAgent->new; + my $request = HTTP::Request->new(GET => "$URL"); + my $response = $ua->request($request, \&callback); + my ($filename) = reverse (split "/", $URL); # get filename part of the URL + open (TEMP, "> $prefix/$filename") + || die "Cannot open temporary file: $prefix/$filename\n"; + print TEMP $cont; + close TEMP; +} + +sub update { + untie %map; + tie(%map, 'DB_File', $uris_dbm.".db", O_RDWR, 0664); +} + +sub mk_urls_of_uris { +#rebuild $uris_dbm.db fetching resource indexes from servers + my ( + $server, $idxfile, $uri, $url, $comp, $line, + @servers, + %urls_of_uris + ); + + untie %map; + if (stat $uris_dbm.".db") { # remove old db file + unlink($uris_dbm.".db") or + die "cannot unlink old db file: $uris_dbm.db\n"; + } + tie(%urls_of_uris, 'DB_File', $uris_dbm.".db", O_RDWR|O_CREAT, 0664); + + open (SRVS, "< $servers_file") or + die "cannot open servers file: $servers_file\n"; + @servers = ; + close (SRVS); + while ($server = pop @servers) { #cicle on servers in reverse order + print "processing server: $server ...\n"; + chomp $server; + helm_wget($tmp_dir, $server."/".$indexname); #get index + $idxfile = $tmp_dir."/".$indexname; + open (INDEX, "< $idxfile") or + die "cannot open temporary index file: $idxfile\n"; + while ($line = ) { #parse index and add entry to urls_of_uris + chomp $line; + ($uri,$comp) = split /[ \t]+/, $line; + # build url: + if ($comp =~ /gz/) { + $url = $uri . ".xml" . ".gz"; + } else { + $url = $uri . ".xml"; + } + $url =~ s/cic:/$server/; + $url =~ s/theory:/$server/; + $urls_of_uris{$uri} = $url; + } + close INDEX; + die "cannot unlink temporary file: $idxfile\n" + if (unlink $idxfile) != 1; + } + + untie(%urls_of_uris); + tie(%map, 'DB_File', $uris_dbm.".db", O_RDWR, 0664); +} + diff --git a/helm/metadata/create/split.pl b/helm/metadata/create/split.pl new file mode 100755 index 000000000..9b3a3746e --- /dev/null +++ b/helm/metadata/create/split.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl + +foreach $inputfile (@ARGV) { + print "Now splitting file $inputfile\n"; + open(IN, "<$inputfile"); + while($line = ) { + if (not ($line =~ /^/)) { + if ($line =~ /^$/$1/; + $line =~ s/^cic:/rdf/; + $line =~ s/#xpointer\(1\/([^\/]*)\/([^\/]*)\)/,$1,$2/; + $line =~ s/#xpointer\(1\/([^\/]*)\)/,$1/; + $dir = $line; + $dir =~ s/\/[^\/]*$//; + close(OUT); + system("mkdir -p $dir"); + open(OUT, ">>$line"); + } else { + print OUT $line; + } + } + } + close(IN); +} +close(OUT); diff --git a/helm/metadata/create/uris_of_filenames.pl b/helm/metadata/create/uris_of_filenames.pl new file mode 100755 index 000000000..f67d30d4f --- /dev/null +++ b/helm/metadata/create/uris_of_filenames.pl @@ -0,0 +1,14 @@ +#!/usr/bin/perl + +while() { + chomp; + split / /; + for (@_) { + $GZSUFF = ""; + if (/.gz$/) + { s/.gz$//; $GZSUFF = " gz" if ($ARGV[0] == "-gz"); } + s/\./helm:rdf:www.cs.unibo.it\/helm\/rdf\/rdfprova\/\/cic:/; + s/\.xml//; + print $_.$GZSUFF."\n"; + } +} -- 2.39.2