From d415fd267ae0991619f64412c3b388af4975317e Mon Sep 17 00:00:00 2001 From: Claudio Sacerdoti Coen Date: Thu, 13 Dec 2001 14:21:33 +0000 Subject: [PATCH] Generation of forward metadata using a lexical analyser. --- helm/metadata/create4/METADATA/Makefile | 17 ++ helm/metadata/create4/METADATA/meta_lex.l | 263 +++++++++++++++++++++ helm/metadata/create4/METADATA/sthandler.c | 263 +++++++++++++++++++++ helm/metadata/create4/METADATA/sthandler.h | 8 + helm/metadata/create4/Makefile | 27 +++ 5 files changed, 578 insertions(+) create mode 100644 helm/metadata/create4/METADATA/Makefile create mode 100644 helm/metadata/create4/METADATA/meta_lex.l create mode 100644 helm/metadata/create4/METADATA/sthandler.c create mode 100644 helm/metadata/create4/METADATA/sthandler.h create mode 100644 helm/metadata/create4/Makefile diff --git a/helm/metadata/create4/METADATA/Makefile b/helm/metadata/create4/METADATA/Makefile new file mode 100644 index 000000000..160f0bbf9 --- /dev/null +++ b/helm/metadata/create4/METADATA/Makefile @@ -0,0 +1,17 @@ +CC = gcc + +meta: lex.yy.o sthandler.o + gcc lex.yy.o sthandler.o -o meta + +lex.yy.c: meta_lex.l sthandler.h + flex meta_lex.l + +sthandler.o: sthandler.c sthandler.h + +lex.yy.o: lex.yy.c sthandler.h + gcc -c lex.yy.c + +clean: + -rm *.o + -rm lex.yy.c + -rm meta diff --git a/helm/metadata/create4/METADATA/meta_lex.l b/helm/metadata/create4/METADATA/meta_lex.l new file mode 100644 index 000000000..7c3d0b43f --- /dev/null +++ b/helm/metadata/create4/METADATA/meta_lex.l @@ -0,0 +1,263 @@ + /******************************************************************/ + /* Copyright (C) 2000, HELM Team */ + /* */ + /* This file is part of HELM, an Hypertextual, Electronic */ + /* Library of Mathematics, developed at the Computer Science */ + /* Department, University of Bologna, Italy. 
*/ + /* */ + /* HELM is free software; you can redistribute it and/or */ + /* modify it under the terms of the GNU General Public License */ + /* as published by the Free Software Foundation; either version */ + /* 2 of the License, or (at your option) any later version. */ + /* */ + /* HELM is distributed in the hope that it will be useful, */ + /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ + /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ + /* GNU General Public License for more details. */ + /* */ + /* You should have received a copy of the GNU General Public */ + /* License along with HELM; if not, write to the Free Software */ + /* Foundation, Inc., 59 Temple Place - Suite 330, Boston, */ + /* MA 02111-1307, USA. */ + /* */ + /* For details, see the HELM World-Wide-Web page, */ + /* http://cs.unibo.it/helm/. */ + /******************************************************************/ + + /***************************************************************/ + /* META_LEXAN */ + /* Automatic Metadata Extractor */ + /* First draft 11/12/2001, by Andrea Asperti */ + /***************************************************************/ + + /***************************************************************/ + /* 1. Inclusion of header files. */ + /***************************************************************/ + +%{ +#include +#include +#include "sthandler.h" +%} + + /***************************************************************/ + /* 2. 
Constants and Variables Definitions                        */
+ /***************************************************************/
+
+%{
+/* NOWHERE..MUTCONSTRUCT are the values stored in `where`;        */
+/* presumably the kind of object reference being scanned -        */
+/* TODO confirm against the rules.                                */
+#define NOWHERE      0
+#define CONST        1
+#define MUTIND       2
+#define MUTCONSTRUCT 3
+
+/* Position codes handed to search_bucket(); sthandler.c defines  */
+/* the same names with the same values and the two must agree.    */
+#define INBODY       0
+#define MAINHYP      1
+#define INHYP        2
+#define INCONCL      3
+#define MAINCONCL    4
+#define INTYPE       5
+#define NOTFOUND     6
+
+/* Values taken by `first_child`. */
+#define BEFORE       0
+#define HERE         1
+#define AFTER        2
+
+
+int where = NOWHERE;
+int found = NOTFOUND;
+int position = INBODY;
+int first_child = BEFORE;
+int no_open_source =0;
+int tmp_n;
+char sep = '"';
+char *xpointer = "#xpointer(1/";
+char *uri;
+char *tmp;
+%}
+
+ /***************************************************************/
+ /* 3. Regular definitions.                                     */
+ /***************************************************************/
+
+uri      [^"]+
+digits   [0-9]+
+
+ /***************************************************************/
+ /* 4. Rules.                                                   */
+ /***************************************************************/
+
+
+
+%%
+
+ /* NOTE(review): in this copy the rule patterns below read "" and  */
+ /* part of the rule set, the second %% and the start of main() are */
+ /* missing; it looks as if the XML tag text was stripped from the  */
+ /* patterns and strings. Restore from the repository before use.   */
+
+""       {
+           position = INTYPE;
+           first_child = BEFORE;
+         }
+
+""       {
+           if (position == INHYP)
+           {
+             no_open_source--;
+             /* printf("source %d\n", no_open_source); */
+             if (no_open_source == 0)
+             { position = INTYPE;
+               first_child = BEFORE; };
+           };
+         }
+
+
+""       {
+           position = INBODY;
+         }
+
+.|\n     {
+         }
+
+"\n\n");
+  printf("\n");
+  printf("\n");
+  print_all();
+  printf("\n");
+  printf("\n");
+  }
+
+/* search(): record one occurrence of `uri` in the symbol table and */
+/* store search_bucket()'s result in the global `found`.            */
+/* The three parameters shadow the globals of the same names.       */
+/*  - first_child == HERE: an INHYP occurrence is recorded as       */
+/*    MAINHYP and an INCONCL occurrence as MAINCONCL; for any other */
+/*    position nothing is recorded and `found` keeps its old value. */
+/*  - otherwise the occurrence is recorded under `position` as is.  */
+/* No value is returned (implicit-int K&R definition).              */
+search(uri,first_child,position)
+char *uri;
+int first_child;
+int position;
+{
+  if (first_child == HERE)
+  {
+    if (position == INHYP)
+      found = search_bucket(uri,MAINHYP);
+    else if (position == INCONCL)
+      found = search_bucket(uri,MAINCONCL);
+    /* if (found == NOTFOUND)
+       printf( "pos = %d, uri = %s\n", MAINCONCL, uri); */
+  }
+  else found = search_bucket(uri,position);
+  /* if (found == NOTFOUND)
+     printf( "pos = %d, uri = %s\n", position, uri); */
+ }
+
+/* yywrap(): always returns 1, which by the flex convention means   */
+/* there is no further input once the current one is exhausted.     */
+int yywrap() {
+  return 1;
+ }
+
+
+
+
+
+
+
diff --git a/helm/metadata/create4/METADATA/sthandler.c b/helm/metadata/create4/METADATA/sthandler.c new file mode 100644 index
000000000..6eb2b0eee --- /dev/null +++ b/helm/metadata/create4/METADATA/sthandler.c @@ -0,0 +1,263 @@ +/*********************************************************************/ +/* Copyright (C) 2000, HELM Team */ +/* */ +/* This file is part of HELM, an Hypertextual, Electronic */ +/* Library of Mathematics, developed at the Computer Science */ +/* Department, University of Bologna, Italy. */ +/* */ +/* HELM is free software; you can redistribute it and/or */ +/* modify it under the terms of the GNU General Public License */ +/* as published by the Free Software Foundation; either version 2 */ +/* of the License, or (at your option) any later version. */ +/* */ +/* HELM is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with HELM; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place - Suite 330, Boston, */ +/* MA 02111-1307, USA. */ +/* */ +/* For details, see the HELM World-Wide-Web page, */ +/* http://cs.unibo.it/helm/. */ + /*********************************************************************/ + +/****************************************************************/ +/* STHANDLER.C */ +/****************************************************************/ +/* This module supplies routines for symbol table handling. */ +/* - init_symbol_table(): it initializes the symbol table */ +/* to void. 
*/
+/*  - search_bucket(): it searches the symbol table for the    */
+/*                bucket containing a given identifier, and     */
+/*                inserts it if it is not present;              */
+/****************************************************************/
+/* First draft 11/12/2001, by Andrea Asperti                    */
+/****************************************************************/
+
+/****************************************************************/
+/* 1. Inclusion of header files.                                */
+/****************************************************************/
+
+/* The header names were missing from the two #include lines,   */
+/* which does not compile; these are the headers that declare   */
+/* the library functions this module calls.                     */
+#include <stdio.h>    /* printf, NULL */
+#include <stdlib.h>   /* malloc */
+#include <string.h>   /* strcmp, strlen, strcpy */
+
+/****************************************************************/
+/* 2. Declarations                                              */
+/****************************************************************/
+
+
+/* Number of hash chains in the dictionary, and the shift/mask  */
+/* parameters used by hash_pjw().                               */
+#define DICTSIZE 211
+#define HASH1 4
+#define HASH2 0xf0000000
+#define HASH3 24
+#define EOS '\0'
+
+/* Position codes; meta_lex.l defines the same names with the   */
+/* same values and the two files must agree.                    */
+#define INBODY       0
+#define MAINHYP      1
+#define INHYP        2
+#define INCONCL      3
+#define MAINCONCL    4
+#define INTYPE       5
+#define NOTFOUND     6
+
+/****************************************************************/
+/* 3. Types.                                                    */
+/****************************************************************/
+
+/* One symbol-table entry. Each entry sits on two singly linked */
+/* lists: its hash chain (next_st_bucket) and the list of every */
+/* entry ever allocated (all_next), which print_all() walks.    */
+struct st_bucket {
+        char                    *id;
+                                /* identifier */
+        struct st_bucket        *next_st_bucket;
+                                /* next bucket in the list */
+        struct st_bucket        *all_next;
+                                /* all buckets in symbol
+                                   table are linked together */
+        int                     pos[5];
+                                /* pos[p] == 1 when the identifier
+                                   was recorded at position code p;
+                                   only codes 0..4 (INBODY ..
+                                   MAINCONCL) have a slot, INTYPE
+                                   and NOTFOUND do not */
+
+        };
+
+struct st_bucket    *dictionary[DICTSIZE];
+                              /* pointers to bucket lists */
+
+/****************************************************************/
+/* 4. Definitions of functions to be exported.                  */
+/****************************************************************/
+
+struct st_bucket *all;
+                              /* head of the list linking every
+                                 bucket through all_next */
+
+  /* The following function initializes the symbol table to NULL: */
+  /* every hash chain and the global bucket list become empty.    */
+  /* Buckets already allocated are not freed.                     */
+void init_symbol_table(void)
+{
+        int i;
+
+        /* initialize the dictionary */
+        for (i = 0; i < DICTSIZE; i++)
+                dictionary[i] = NULL;
+        all = NULL;
+}
+
+  /* The following function searches the symbol table for an identifier */
+  /* and inserts it if it is not present.                               */
+  /* The bucket associated with the given identifier                    */
+  /* becomes the first one in its list.                                 */
+
+  /* Helpers defined further down in this file. They are declared here  */
+  /* so that the calls below no longer rely on implicit declarations.   */
+int hash_pjw(char *id);
+int allocate_bucket(struct st_bucket **st, char *id, int where);
+int move_bucket(struct st_bucket *st, int dict_index);
+static void print_one(char *uri, int pos);
+
+  /* search_bucket(id, where)                                           */
+  /*   id    - identifier to look up (NUL-terminated string)            */
+  /*   where - position code at which the identifier was seen; only     */
+  /*           codes 0..4 have a slot in st_bucket.pos[]                */
+  /* Returns NOTFOUND when the identifier was not yet in the table (it  */
+  /* is then inserted with pos[where] set), otherwise returns `where`   */
+  /* after setting pos[where] on the existing bucket.                   */
+  /* A `where` outside 0..4 (e.g. INTYPE or NOTFOUND) used to index     */
+  /* past the end of pos[]; it is now rejected: nothing is looked up    */
+  /* or inserted and NOTFOUND is returned.                              */
+int search_bucket(char *id, int where)
+{
+        int                 dict_index;
+                            /* value returned by the */
+                            /* hash function */
+        struct st_bucket    *prev;
+        struct st_bucket    *curr;
+        struct st_bucket    *st = NULL;
+
+        /* pos[] is declared with 5 elements */
+        if (where < 0 || where >= 5)
+                return NOTFOUND;
+
+        /* apply the hash function */
+        dict_index = hash_pjw(id);
+
+        /* scan the bucket list indicated by the hash function */
+        prev = curr = dictionary[dict_index];
+        while (curr != NULL && strcmp(id, curr->id) != 0)
+          {
+            prev = curr;
+            curr = curr->next_st_bucket;
+          }
+
+        if (curr == NULL)
+          {
+            /* the identifier is not in the list: create a bucket */
+            /* and link it at the head of its chain               */
+            allocate_bucket(&st, id, where);
+            if (st != NULL)
+                    move_bucket(st, dict_index);
+            return NOTFOUND;
+          }
+
+        /* the identifier is already in the list */
+        curr->pos[where] = 1;
+        /* Any position other than INBODY clears the INBODY flag. The   */
+        /* original note says it "will never be set again to 1", but a  */
+        /* later call with where == INBODY does set it again.           */
+        if (where >= 1)
+                curr->pos[0] = 0;
+        if (prev != curr)
+          {
+            /* the identifier is not in the first position: unlink it */
+            /* and re-insert it at the head of the chain              */
+            prev->next_st_bucket = curr->next_st_bucket;
+            move_bucket(curr, dict_index);
+          }
+        return where;
+}
+
+  /* print_all(): walk every bucket through all_next, starting at the   */
+  /* most recently allocated one, and call print_one() once for each    */
+  /* position flag that is set. Declared void in sthandler.h, so it is  */
+  /* defined void here as well.                                         */
+void print_all(void)
+{
+        int i;
+        struct st_bucket *curr;
+
+        for (curr = all; curr != NULL; curr = curr->all_next)
+                for (i = 0; i < 5; ++i)
+                        if (curr->pos[i] == 1)
+                                print_one(curr->id, i);
+}
+
+
+/****************************************************************/
+/* 5. Definitions of functions local to the module.             */
+/****************************************************************/
+
+  /* print_one(uri, pos): emit the output record for one (identifier,   */
+  /* position) pair.                                                    */
+  /* NOTE(review): in this copy the three format strings contain only   */
+  /* a newline and neither argument is used; the markup that printed    */
+  /* uri and pos appears to have been lost. Restore the strings from    */
+  /* the repository before relying on this output.                      */
+static void print_one(char *uri, int pos)
+{
+        (void)uri;
+        (void)pos;
+        printf("\n");
+        printf("\n");
+        printf("\n");
+}
+
+  /* The following function allocates a bucket for an identifier.
*/
+  /* allocate_bucket(st, id, where)                                     */
+  /*   st    - out: receives the new bucket, or NULL when memory could  */
+  /*           not be obtained                                          */
+  /*   id    - identifier; a private copy is stored in the bucket       */
+  /*   where - position code whose flag is set in the new bucket; a     */
+  /*           code outside 0..4 sets no flag                           */
+  /* The bucket is pushed on the global `all` list but is NOT linked    */
+  /* into a hash chain; the caller does that with move_bucket().        */
+  /* Returns 0 on success and -1 on allocation failure. The return type */
+  /* stays int because callers earlier in the file use the function     */
+  /* through an implicit declaration.                                   */
+int allocate_bucket(struct st_bucket **st, char *id, int where)
+{
+        struct st_bucket *bucket;
+        int i;
+
+        *st = NULL;
+
+        bucket = malloc(sizeof *bucket);
+        if (bucket == NULL)
+          {
+            fprintf(stderr, "sthandler: out of memory\n");
+            return -1;
+          }
+
+        /* strlen() does not count the terminating EOS, so one extra    */
+        /* byte is needed. The old size, sizeof('a')*strlen(id), only   */
+        /* worked because a character constant has type int in C, and   */
+        /* it asked for zero bytes when id was the empty string.        */
+        bucket->id = malloc(strlen(id) + 1);
+        if (bucket->id == NULL)
+          {
+            free(bucket);
+            fprintf(stderr, "sthandler: out of memory\n");
+            return -1;
+          }
+        strcpy(bucket->id, id);
+
+        bucket->next_st_bucket = NULL;
+        bucket->all_next = all;
+        all = bucket;
+
+        for (i = 0; i < 5; ++i)
+                bucket->pos[i] = 0;
+        /* pos[] has 5 elements; never write outside it */
+        if (where >= 0 && where < 5)
+                bucket->pos[where] = 1;
+
+        *st = bucket;
+        return 0;
+}
+
+  /* The following function moves a bucket to the head of the */
+  /* list in which it lies. The caller must already have      */
+  /* unlinked the bucket from its old place in that list.     */
+  /*   st         - bucket to be moved; NULL is ignored       */
+  /*   dict_index - index of the hash chain it belongs to     */
+  /* Always returns 0.                                        */
+int move_bucket(struct st_bucket *st, int dict_index)
+{
+        if (st == NULL)
+                return 0;
+        st->next_st_bucket = dictionary[dict_index];
+        dictionary[dict_index] = st;
+        return 0;
+}
+
+  /* The following function implements Weinberger's hash function. */
+  /*   id - identifier to be hashed (NUL-terminated string)        */
+  /* Returns an index in the range 0 .. DICTSIZE-1.                */
+int
+hash_pjw(char *id)
+{
+        unsigned h = 0;
+        unsigned g;
+
+        for (; *id != EOS; id++)
+          {
+            /* hash the byte as unsigned so the result does not */
+            /* depend on whether plain char is signed           */
+            h = (h << HASH1) + (unsigned char)*id;
+            g = h & HASH2;
+            if (g != 0)
+                    h = h ^ (g >> HASH3) ^ g;
+          }
+        return (int)(h % DICTSIZE);
+}
+
+
+
+
+
diff --git a/helm/metadata/create4/METADATA/sthandler.h b/helm/metadata/create4/METADATA/sthandler.h new file mode 100644 index 000000000..d4e17d587 --- /dev/null +++ b/helm/metadata/create4/METADATA/sthandler.h @@ -0,0 +1,8 @@ +/****************************************************************/ +/* STHANDLER.H */ +/****************************************************************/ + + +extern void init_symbol_table(); +extern void print_all(); +extern int search_bucket(); diff --git a/helm/metadata/create4/Makefile b/helm/metadata/create4/Makefile new file mode 100644 index 000000000..30f0540d3 --- /dev/null +++ b/helm/metadata/create4/Makefile @@ -0,0 +1,27 @@ +all: + @echo Available targets: + @echo " forward, backward, compress, clean-forward, clean-backward" + +forward: + time for i in `cat pluto` ; do (cd tmp ; wget -t 1
"http://phd.cs.unibo.it:8081/getxml?format=gz&uri=$$i") ; mkdir -p forward/`dirname $$i | sed "s/cic:\///"` ; zcat tmp/`basename $$i` | METADATA/meta `basename $$i` > forward/`echo $$i | sed "s/cic:\///"` ; rm tmp/`basename $$i` ; done > log 2>&1 + (cd forward ; ../mkindex.sh forward) + +backward: + time for i in `cat pluto` ; do touch/touch.opt $$i ; done + find forward -type f -exec ./invert.pl {} \; + find backward -type f -exec ./fix_rdf.pl {} \; + (cd backward ; ../mkindex.sh backward) + +compress: + find forward -name "*.xml" -exec gzip {} \; + find backward -name "*.xml" -exec gzip {} \; + (cd forward ; ../mkindex.sh forward) + (cd backward ; ../mkindex.sh backward) + +clean-forward: + rm -rf forward/* + +clean-backward: + rm -rf backward/* + +.PHONY: all forward backward compress clean-forward clean-backward -- 2.39.2