]> matita.cs.unibo.it Git - helm.git/commitdiff
Generation of forward metadata using a lexical analyser.
author Claudio Sacerdoti Coen <claudio.sacerdoticoen@unibo.it>
Thu, 13 Dec 2001 14:21:33 +0000 (14:21 +0000)
committer Claudio Sacerdoti Coen <claudio.sacerdoticoen@unibo.it>
Thu, 13 Dec 2001 14:21:33 +0000 (14:21 +0000)
helm/metadata/create4/METADATA/Makefile [new file with mode: 0644]
helm/metadata/create4/METADATA/meta_lex.l [new file with mode: 0644]
helm/metadata/create4/METADATA/sthandler.c [new file with mode: 0644]
helm/metadata/create4/METADATA/sthandler.h [new file with mode: 0644]
helm/metadata/create4/Makefile [new file with mode: 0644]

diff --git a/helm/metadata/create4/METADATA/Makefile b/helm/metadata/create4/METADATA/Makefile
new file mode 100644 (file)
index 0000000..160f0bb
--- /dev/null
@@ -0,0 +1,17 @@
+# Builds `meta`, the forward-metadata extractor, from the flex scanner
+# (meta_lex.l) and the symbol-table module (sthandler.c).
+CC = gcc
+
+# Link step; uses $(CC) so that overriding CC affects every compile and link.
+meta: lex.yy.o sthandler.o
+       $(CC) lex.yy.o sthandler.o -o meta
+
+lex.yy.c: meta_lex.l sthandler.h
+       flex meta_lex.l
+
+# No recipe here: make's built-in .c -> .o rule compiles it with $(CC).
+sthandler.o: sthandler.c sthandler.h
+
+lex.yy.o: lex.yy.c sthandler.h
+       $(CC) -c lex.yy.c
+
+# rm -f succeeds silently when a file is already gone, so `make clean`
+# can be run on a clean tree without error noise.
+clean:
+       rm -f *.o lex.yy.c meta
+
+.PHONY: clean
diff --git a/helm/metadata/create4/METADATA/meta_lex.l b/helm/metadata/create4/METADATA/meta_lex.l
new file mode 100644 (file)
index 0000000..7c3d0b4
--- /dev/null
@@ -0,0 +1,263 @@
+ /******************************************************************/
+ /*  Copyright (C) 2000, HELM Team                                 */ 
+ /*                                                                */
+ /* This file is part of HELM, an Hypertextual, Electronic         */
+ /* Library of Mathematics, developed at the Computer Science      */
+ /* Department, University of Bologna, Italy.                      */
+ /*                                                                */
+ /* HELM is free software; you can redistribute it and/or          */
+ /* modify it under the terms of the GNU General Public License    */
+ /* as published by the Free Software Foundation; either version   */
+ /* 2 of the License, or (at your option) any later version.       */
+ /*                                                                */
+ /* HELM is distributed in the hope that it will be useful,        */
+ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the   */
+ /* GNU General Public License for more details.                   */
+ /*                                                                */
+ /* You should have received a copy of the GNU General Public      */
+ /* License along with HELM; if not, write to the Free Software    */
+ /* Foundation, Inc., 59 Temple Place - Suite 330, Boston,         */
+ /* MA  02111-1307, USA.                                           */
+ /*                                                                */
+ /* For details, see the HELM World-Wide-Web page,                 */
+ /* http://cs.unibo.it/helm/.                                      */
+ /******************************************************************/
+
+ /***************************************************************/
+ /*                       META_LEXAN                           */
+ /*                 Automatic Metadata Extractor                */
+ /*           First draft 11/12/2001, by Andrea Asperti         */
+ /***************************************************************/
+
+ /***************************************************************/
+ /* 1. Inclusion of header files.                              */
+ /***************************************************************/
+
+%{
+#include                <string.h>
+#include                <stdlib.h>
+#include                "sthandler.h"
+%}
+
+ /***************************************************************/
+ /* 2. Constants and Variables Definitions                      */
+ /***************************************************************/
+
+%{
+/* Values of `where`: the kind of element whose attributes are being read. */
+#define                 NOWHERE   0
+#define                 CONST     1
+#define                 MUTIND    2
+#define                 MUTCONSTRUCT  3
+
+/* Values of `position` and of the slot handed to search_bucket().        */
+/* The same numbers are defined again in sthandler.c; keep them in sync.  */
+#define                 INBODY    0
+#define                 MAINHYP   1
+#define                 INHYP     2
+#define                 INCONCL   3
+#define                 MAINCONCL 4
+#define                 INTYPE    5
+#define                 NOTFOUND  6
+
+/* Values of `first_child`: whether the head symbol of the current type   */
+/* or hypothesis is still to come, is the current one, or is behind us.   */
+#define                 BEFORE    0
+#define                 HERE      1     
+#define                 AFTER     2
+
+
+/* element kind set by the <CONST, <MUTIND and <MUTCONSTRUCT rules */
+int                    where = NOWHERE;
+/* last value returned by search_bucket() through search() */
+int                     found = NOTFOUND;
+/* part of the document currently being scanned */
+int                     position = INBODY;
+/* head-symbol tracking, see BEFORE / HERE / AFTER */
+int                     first_child = BEFORE;
+/* nesting depth of <source elements counted while inside a hypothesis */
+int                     no_open_source =0;
+/* scratch number for the numeric attribute rules */
+int                     tmp_n;
+/* quote character that delimits attribute values */
+char                    sep = '"';
+/* XPointer prefix appended to the URI of an inductive type */
+char                    *xpointer = "#xpointer(1/";
+/* URI of the reference currently being assembled */
+char                    *uri;
+/* scratch buffer for the numeric attribute rules */
+char                    *tmp;
+%}
+
+ /***************************************************************/
+ /* 3. Regular definitions.                                    */
+ /***************************************************************/
+
+ /* attribute value: one or more characters other than a double quote */
+uri                     [^"]+
+ /* one or more decimal digits */
+digits                  [0-9]+                   
+
+ /***************************************************************/
+ /* 4. Rules.                                                  */
+ /***************************************************************/
+
+
+
+%%
+
+"<type>"           { 
+                     /* entering the type: its head symbol is still ahead */
+                     position = INTYPE;
+                     first_child = BEFORE;
+                   }
+
+"<source"          {
+                    /* A source element seen while in the type or in a
+                       hypothesis opens (or nests inside) a hypothesis.  It
+                       is ignored at every other position, INCONCL included. */
+                    if ((position == INTYPE) ||
+                        (position == INHYP))
+                        { position = INHYP;
+                          no_open_source++;};
+                    /* printf("source %d\n", no_open_source); */
+                   }
+
+"</source>"        {
+                    if (position == INHYP)
+                     {
+                      no_open_source--;
+                      /* printf("source %d\n", no_open_source); */
+                      /* the outermost source just closed: back on the spine
+                         of the type, with a new head symbol to look for */
+                      if (no_open_source == 0) 
+                       { position = INTYPE;
+                         first_child = BEFORE; };
+                     };
+                   }
+
+
+"<body>"           { 
+                     /* note: first_child is left untouched here */
+                     position = INBODY;
+                   }
+
+.|\n               {
+                     /* Discard one character.  Every other pattern is longer
+                        than one character and flex prefers the longest
+                        match, so this rule does not shadow the rules that
+                        follow it. */
+                   }
+
+"<APPLY"           {
+                    /* an application moves the head tracking one step on:
+                       BEFORE becomes HERE, HERE becomes AFTER */
+                    switch (first_child)
+                      {
+                        case BEFORE: first_child = HERE;  break;
+                        case HERE:   first_child = AFTER; break;
+                        default:     break;
+                      }
+                   } 
+
+"<LAMBDA"          |
+"<REL"             |
+"<MUTCASE"         |
+"<FIX"             |
+"<COFIX"           { 
+                    /* none of these can be the head symbol: stop looking */
+                    switch (first_child)
+                      {
+                        case BEFORE:
+                        case HERE:   first_child = AFTER; break;
+                        default:     break;
+                      }
+                   }
+
+"<CONST"           { 
+                     where = CONST;
+                     if (first_child == BEFORE)
+                          first_child = HERE;
+                     /* a CONST met on the spine of the type starts the
+                        conclusion */
+                     if (position == INTYPE)
+                          position = INCONCL;
+                   }
+
+"<MUTIND"          { 
+                     where = MUTIND;
+                     if (first_child == BEFORE)
+                          first_child = HERE;
+                     /* a MUTIND met on the spine of the type starts the
+                        conclusion */
+                     if (position == INTYPE)
+                          position = INCONCL;
+                   }
+
+"<MUTCONSTRUCT"    { 
+                     where = MUTCONSTRUCT;
+                     if (first_child == BEFORE)
+                          first_child = HERE;
+                     /* a MUTCONSTRUCT met on the spine of the type starts
+                        the conclusion */
+                     if (position == INTYPE)
+                          position = INCONCL;
+                   }
+
+"uri=\""{uri}      {     
+                         /* The match is the attribute name, the opening
+                            quote and the value; the closing quote is not
+                            part of it.  The value therefore starts right
+                            after the first quote character. */
+                         const char *value = strchr(yytext, sep) + 1;
+
+                         /* Drop a URI that no later rule consumed, then keep
+                            a private copy.  The pointer returned by malloc
+                            is never advanced, so every free() below is
+                            handed the pointer it expects. */
+                         free(uri);
+                         uri = (char *)malloc(strlen(value) + 1);
+                         if (uri == NULL)
+                             { perror("meta: malloc"); exit(EXIT_FAILURE); };
+                         strcpy(uri, value);
+                         if (where == CONST)
+                             {
+                                /* a constant is fully named by its uri */
+                                search(uri,first_child,position); 
+                                where = NOWHERE;
+                                first_child = AFTER;
+                                free(uri); 
+                                uri = NULL;
+                              };
+                   } 
+
+"noType=\""{digits} {
+                         if (((where == MUTIND) || (where == MUTCONSTRUCT)) &&
+                             (uri != NULL))
+                          { 
+                            /* digits that follow the opening quote */
+                            const char *num = strchr(yytext, sep) + 1;
+                            /* room for the prefix, any int, a closing
+                               parenthesis and the terminating NUL */
+                            size_t len = strlen(uri) + strlen(xpointer) +
+                                         3 * sizeof(int) + 4;
+                            char *grown = (char *)realloc(uri, len);
+
+                            if (grown == NULL)
+                                { perror("meta: realloc"); exit(EXIT_FAILURE); };
+                            uri = grown;
+                            /* the attribute value is incremented by one
+                               before it is written into the XPointer */
+                            tmp_n = atoi(num)+1;
+                            sprintf(uri + strlen(uri), "%s%d", xpointer, tmp_n);
+                            if (where == MUTIND) 
+                              { 
+                                /* an inductive type is complete here; a
+                                   constructor waits for noConstr */
+                                strcat(uri,")");
+                                search(uri,first_child,position); 
+                                free(uri);
+                                uri = NULL;
+                                where = NOWHERE; 
+                                first_child = AFTER;
+                              };
+                          };
+                   } 
+
+"noConstr=\""{digits} {
+                         if ((where == MUTCONSTRUCT) && (uri != NULL))
+                          { 
+                            /* digits that follow the opening quote; they
+                               are appended verbatim */
+                            const char *num = strchr(yytext, sep) + 1;
+                            /* room for the slash, the digits, the closing
+                               parenthesis and the terminating NUL */
+                            char *grown = (char *)realloc(uri,
+                                              strlen(uri) + strlen(num) + 3);
+
+                            if (grown == NULL)
+                                { perror("meta: realloc"); exit(EXIT_FAILURE); };
+                            uri = grown;
+                            strcat(uri,"/");
+                            strcat(uri,num);
+                            strcat(uri,")");
+                            search(uri,first_child,position);
+                            free(uri);
+                            uri = NULL;
+                            where = NOWHERE; 
+                            first_child = AFTER;
+                          };
+                   } 
+
+
+
+%%
+
+ /***************************************************************/
+ /* 6. Auxiliary functions.                                    */
+ /***************************************************************/
+
+/* Scans the XML description of an object read by yylex(), collecting every
+   referenced URI in the symbol table, then prints the RDF description of
+   the object on standard output.
+     argv[1]  URI of the object, written into rdf:about.
+   Returns 0 on success and 1 when the URI argument is missing. */
+int main(int argc, char *argv[])
+{                  
+                   if (argc < 2)
+                      {
+                       fprintf(stderr, "usage: meta <object-uri> < object.xml\n");
+                       return 1;
+                      }
+                   init_symbol_table();
+                   yylex();
+                   printf("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n\n");
+                   /* the h: namespace must be the same URI that print_one()
+                      uses in rdf:about (http:// with two slashes) */
+                   printf("<rdf:RDF xml:lang=\"en\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\" xmlns:h=\"http://www.cs.unibo.it/helm/schemas/schema-h.rdf#\">\n");
+                   printf("<h:Object rdf:about=\"");
+                   printf("%s",argv[1]);
+                   printf("\">\n");
+                   print_all();
+                   printf("</h:Object>\n");
+                   printf("</rdf:RDF>\n");
+                   return 0;
+                   } 
+
+/* Records one occurrence of uri in the symbol table.
+     uri          identifier to record
+     first_child  BEFORE, HERE or AFTER; HERE marks the head symbol
+     position     where the occurrence was met (INBODY, INHYP, INCONCL)
+   A head symbol in a hypothesis is recorded as MAINHYP and a head symbol
+   in the conclusion as MAINCONCL.  Every other occurrence is recorded
+   under position itself; this includes a head symbol met elsewhere (for
+   instance in the body), which used to be dropped without a trace.
+   Stores the result of search_bucket() in the global found and returns it. */
+int search(uri,first_child,position)
+char               *uri;
+int                first_child;
+int                position; 
+{                  
+                   int slot = position;
+
+                   if (first_child == HERE)
+                      {
+                       if (position == INHYP)
+                          slot = MAINHYP;
+                       else if (position == INCONCL)
+                          slot = MAINCONCL;
+                      }
+                   found = search_bucket(uri,slot);
+                   /* if (found == NOTFOUND)
+                          printf( "pos = %d, uri = %s\n", slot, uri); */
+                   return found;
+                    }
+
+/* Called by the scanner at end of input; a non-zero result tells it that
+   there is nothing more to read. */
+int
+yywrap()
+{
+    return 1;
+}
+
+
+
+
+
+
+
diff --git a/helm/metadata/create4/METADATA/sthandler.c b/helm/metadata/create4/METADATA/sthandler.c
new file mode 100644 (file)
index 0000000..6eb2b0e
--- /dev/null
@@ -0,0 +1,263 @@
+/*********************************************************************/
+/*  Copyright (C) 2000, HELM Team                                    */ 
+/*                                                                   */
+/* This file is part of HELM, an Hypertextual, Electronic            */
+/* Library of Mathematics, developed at the Computer Science         */
+/* Department, University of Bologna, Italy.                         */
+/*                                                                   */
+/* HELM is free software; you can redistribute it and/or             */
+/* modify it under the terms of the GNU General Public License       */
+/* as published by the Free Software Foundation; either version 2    */
+/* of the License, or (at your option) any later version.            */
+/*                                                                   */
+/* HELM is distributed in the hope that it will be useful,           */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of    */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the     */
+/* GNU General Public License for more details.                      */
+/*                                                                   */
+/* You should have received a copy of the GNU General Public License */
+/* along with HELM; if not, write to the Free Software               */
+/* Foundation, Inc., 59 Temple Place - Suite 330, Boston,            */
+/* MA  02111-1307, USA.                                              */
+/*                                                                   */
+/* For details, see the HELM World-Wide-Web page,                    */
+/* http://cs.unibo.it/helm/.                                         */
+ /*********************************************************************/
+
+/****************************************************************/
+/*                       STHANDLER.C                           */
+/****************************************************************/
+/* This module supplies routines for symbol table handling.    */
+/* - init_symbol_table(): it initializes the symbol table      */
+/*                       to void.                              */
+/* - search_bucket(): it searches the symbol table for the     */
+/*                   bucket containing a given identifier, and */
+/*                   inserts it if it is not present;          */
+/****************************************************************/
+/*           First draft 11/12/2001, by Andrea Asperti          */
+/****************************************************************/
+
+/****************************************************************/
+/* 1. Inclusion of header files.                               */
+/****************************************************************/
+
+#include               <stdio.h>
+#include               <stdlib.h>
+#include               <string.h>
+#include               <malloc.h>
+
+/****************************************************************/
+/* 2. Declarations                                              */
+/****************************************************************/
+
+
+/* number of hash chains in the dictionary (a prime) */
+#define         DICTSIZE                        211
+/* hash_pjw(): left shift applied to the hash before adding a character */
+#define         HASH1                           4
+/* hash_pjw(): mask selecting the four most significant bits of 32 */
+#define         HASH2                           0xf0000000
+/* hash_pjw(): right shift applied to the masked bits before folding */
+#define         HASH3                           24
+/* string terminator */
+#define         EOS                             '\0'
+
+/* Occurrence positions, the same numbers as in meta_lex.l.  Only the values */
+/* 0..4 are valid indices into st_bucket.pos[]; INTYPE is not one of them,  */
+/* and NOTFOUND is the value search_bucket() returns after an insertion.    */
+#define                 INBODY    0
+#define                 MAINHYP   1
+#define                 INHYP     2
+#define                 INCONCL   3
+#define                 MAINCONCL 4
+#define                 INTYPE    5
+#define                 NOTFOUND  6
+
+/****************************************************************/
+/* 3. Types.                                                   */
+/****************************************************************/
+
+/* One symbol-table entry: an identifier together with the set of positions
+   at which it has been recorded. */
+struct st_bucket {
+        char             *id;              /* the identifier                 */
+        struct st_bucket *next_st_bucket;  /* next entry on the same hash
+                                              chain                          */
+        struct st_bucket *all_next;        /* next entry on the list that
+                                              links every bucket together    */
+        int               pos[5];          /* pos[p] is 1 when the identifier
+                                              was recorded at position p     */
+};
+
+/* heads of the hash chains, indexed by the value of hash_pjw() */
+struct st_bucket *dictionary[DICTSIZE];
+
+/****************************************************************/
+/* 4. Definitions of functions to be exported.                 */
+/****************************************************************/
+
+/* head of the list that links every allocated bucket through all_next */
+struct st_bucket       *all;
+
+/* Empties the symbol table: every hash chain and the list of all buckets
+   are reset to NULL.  Buckets allocated earlier are not freed. */
+void init_symbol_table()
+{
+       int                     i;
+
+       /* initialize the dictionary */
+       for (i = 0; i < DICTSIZE; i++)
+               dictionary[i] = NULL;
+       all = NULL;
+}
+
+/* Searches the symbol table for an identifier and records an occurrence of
+   it at position where, inserting the identifier if it is not present.
+   The bucket of the identifier becomes the first one in its hash chain.
+     id     identifier to look up
+     where  position of the occurrence; must be a valid index into
+            st_bucket.pos[] (0..4)
+   Returns NOTFOUND when the identifier had to be inserted, where when it
+   was already present.  A where outside pos[] records nothing and also
+   returns NOTFOUND, instead of writing past the end of the array. */
+int search_bucket(id, where)
+       char            *id;
+                                       /* identifier */
+        int             where;
+{
+       int             dict_index;
+                                       /* value returned by the */
+                                       /* hash function */
+       struct st_bucket
+                       *prev,
+                       *curr;
+
+        struct st_bucket *st;
+
+        /* sizeof does not evaluate its operand, so curr is not read here */
+        if ((where < 0) ||
+            (where >= (int)(sizeof(curr->pos) / sizeof(curr->pos[0]))))
+            return NOTFOUND;
+
+        /* apply the hash function */
+        dict_index = hash_pjw(id);
+        /* printf( "%d\n", dict_index); */
+        
+        /* scan the bucket list indicated by the hash function */
+        prev = curr = dictionary[dict_index];
+        while ((curr != NULL) && (strcmp(id, curr->id) != 0))
+          {
+            prev = curr;
+            curr = curr->next_st_bucket;
+          }
+
+       if (curr == NULL)
+          /* the identifier is not in the list */
+          {
+            allocate_bucket(&st,id,where);
+            move_bucket(st,dict_index);
+            return NOTFOUND;
+          }
+
+        /* the identifier is already in the list */
+        curr->pos[where] = 1;
+        /* an occurrence outside the body cancels a recorded InBody one */
+        if (where >= 1) 
+            curr->pos[INBODY] = 0;
+        if (prev != curr)
+          /* the identifier is not in the first position: unlink it and */
+          /* put it back at the head of the chain */
+          {
+            prev->next_st_bucket = curr->next_st_bucket;
+            move_bucket(curr,
+                        dict_index);
+          }
+        return where;
+}
+
+/* Prints one entry, through print_one(), for every position flag that is
+   set in every bucket of the symbol table.  Buckets are visited along the
+   all_next list starting at all.
+   The return type is void to agree with the declaration in sthandler.h. */
+void print_all()
+{
+        int i;
+        struct st_bucket *curr;
+
+        for (curr = all; curr != NULL; curr = curr->all_next)
+          {
+            for (i = 0; i < (int)(sizeof(curr->pos) / sizeof(curr->pos[0])); ++i)
+              if (curr->pos[i] == 1)
+                print_one(curr->id,i);
+          }
+}
+
+
+/****************************************************************/
+/* 5. Definitions of functions local to the module.            */
+/****************************************************************/
+
+/* Prints the h:refObj element describing one occurrence of uri.
+     uri  value written into rdf:value
+     pos  position of the occurrence; it selects the name appended to the
+          schema URI, and any value other than the five below appends
+          nothing */
+print_one(uri,pos)
+     char    *uri;
+     int     pos;
+{
+    char *label;
+
+    switch (pos)
+      {
+        case INBODY:    label = "InBody";          break;
+        case MAINHYP:   label = "MainHypothesis";  break;
+        case INHYP:     label = "InHypothesis";    break;
+        case INCONCL:   label = "InConclusion";    break;
+        case MAINCONCL: label = "MainConclusion";  break;
+        default:        label = "";                break;
+      }
+
+    printf("<h:refObj>\n");
+    printf("<h:Occurrence rdf:about=\"http://www.cs.unibo.it/helm/schemas/schema-h.rdf#");
+    printf("%s", label);
+    printf("\" rdf:value=\"");
+    printf("%s", uri);
+    printf("\"/>\n");
+    printf("</h:refObj>\n");
+}
+
+/* Allocates a bucket for an identifier and links it at the head of the
+   list of all buckets.
+     st     receives the address of the new bucket
+     id     identifier; the bucket stores its own copy
+     where  position flag to set; the other flags start at 0.  A value
+            that is not a valid index into pos[] sets no flag.
+   The process is terminated if memory cannot be obtained.  Returns 0. */
+int allocate_bucket(st, id, where)
+       struct st_bucket
+                        **st;
+                                       /* pointer to the bucket to be */
+                                       /* allocated */
+       char            *id;
+                                       /* identifier */
+        int             where;
+{
+        struct st_bucket *fresh;
+        int i;
+
+        fresh = (struct st_bucket *)malloc(sizeof(struct st_bucket));
+        /* one byte per character plus one for the terminating NUL */
+        if (fresh != NULL)
+            fresh->id = (char *)malloc(strlen(id) + 1);
+        if ((fresh == NULL) || (fresh->id == NULL))
+          {
+            perror("allocate_bucket");
+            exit(EXIT_FAILURE);
+          }
+        strcpy(fresh->id,id);
+        fresh->next_st_bucket = NULL;
+        fresh->all_next = all;
+        all = fresh;
+        for (i = 0; i < 5; ++i)
+            fresh->pos[i] = 0;
+        if ((where >= 0) && (where < 5))
+            fresh->pos[where] = 1;
+        *st = fresh;
+        return 0;
+}
+
+ /* The following function moves a bucket to the head of the */
+ /* list in which it lies. */
+move_bucket(st, dict_index)
+       struct st_bucket 
+                        *st;
+                                       /* bucket to put at the head */
+       int             dict_index;
+                                       /* index of the hash chain */
+                                       /* that receives the bucket */
+{
+       struct st_bucket **head = &dictionary[dict_index];
+
+       /* the former head becomes the successor of st */
+       st->next_st_bucket = *head;
+       *head = st;
+}
+
+/* Weinberger's hash function: returns the index, in the range
+   0 .. DICTSIZE-1, of the hash chain for the identifier id. */
+int
+hash_pjw(id)
+       char            *id;
+                                       /* identifier to be hashed */
+{
+       unsigned        hash = 0;
+       unsigned        top;
+
+       while (*id != EOS)
+       {
+               hash = (hash << HASH1) + (*id);
+               /* fold the bits selected by HASH2 back in and clear them */
+               top = hash & HASH2;
+               if (top != 0)
+                       hash = hash ^ (top >> HASH3) ^ top;
+               id++;
+       }
+       return(hash % DICTSIZE);
+}
+
+
+
+
+
diff --git a/helm/metadata/create4/METADATA/sthandler.h b/helm/metadata/create4/METADATA/sthandler.h
new file mode 100644 (file)
index 0000000..d4e17d5
--- /dev/null
@@ -0,0 +1,8 @@
+/****************************************************************/
+/*                          STHANDLER.H                        */
+/****************************************************************/
+
+            
+extern  void            init_symbol_table();
+extern  void            print_all();
+extern  int             search_bucket();
diff --git a/helm/metadata/create4/Makefile b/helm/metadata/create4/Makefile
new file mode 100644 (file)
index 0000000..30f0540
--- /dev/null
@@ -0,0 +1,27 @@
+# Driver for metadata generation.  The file `pluto` is read as the list of
+# URIs to process; the sed expressions below strip a leading "cic:/" from
+# each of them to obtain the relative output path.
+
+# Default target: only lists the targets that do real work.
+all:
+       @echo Available targets:
+       @echo "   forward, backward, compress, clean-forward, clean-backward"
+
+# For every URI in pluto: download the gzipped XML into tmp/, create the
+# output directory under forward/, pipe the decompressed XML through
+# METADATA/meta, and delete the download.  The output of the whole loop is
+# redirected to `log`.  Afterwards mkindex.sh is run inside forward/.
+forward:
+       time for i in `cat pluto` ; do (cd tmp ; wget -t 1 "http://phd.cs.unibo.it:8081/getxml?format=gz&uri=$$i") ; mkdir -p forward/`dirname $$i | sed "s/cic:\///"` ; zcat tmp/`basename $$i` | METADATA/meta `basename $$i` > forward/`echo $$i | sed "s/cic:\///"` ; rm tmp/`basename $$i` ; done > log 2>&1
+       (cd forward ; ../mkindex.sh forward)
+
+# Runs touch/touch.opt on every URI, then invert.pl on every file under
+# forward/ and fix_rdf.pl on every file under backward/, and finally
+# mkindex.sh inside backward/.
+backward:
+       time for i in `cat pluto` ; do touch/touch.opt $$i ; done
+       find forward -type f -exec ./invert.pl {} \;
+       find backward -type f -exec ./fix_rdf.pl {} \;
+       (cd backward ; ../mkindex.sh backward)
+
+# Gzips every *.xml file in both trees and runs mkindex.sh in each again.
+compress:
+       find forward -name "*.xml" -exec gzip {} \;
+       find backward -name "*.xml" -exec gzip {} \;
+       (cd forward ; ../mkindex.sh forward)
+       (cd backward ; ../mkindex.sh backward)
+
+# Remove everything below forward/ (the directory itself is kept).
+clean-forward:
+       rm -rf forward/*
+
+# Remove everything below backward/ (the directory itself is kept).
+clean-backward:
+       rm -rf backward/*
+
+.PHONY: all forward backward compress clean-forward clean-backward