/* * CDE - Common Desktop Environment * * Copyright (c) 1993-2012, The Open Group. All rights reserved. * * These libraries and programs are free software; you can * redistribute them and/or modify them under the terms of the GNU * Lesser General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) * any later version. * * These libraries and programs are distributed in the hope that * they will be useful, but WITHOUT ANY WARRANTY; without even the * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public * License along with these libraries and programs; if not, write * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth * Floor, Boston, MA 02110-1301 USA */ /* * COMPONENT_NAME: austext * * FUNCTIONS: CNCRD_MEMORY_AREA_LIST * QUERY_STEM_STR * STAT_STR * TREENODE * build_bin_tree * comp_stat * descend_tree * efim_qsort * fill_stem * get_next_memory_block * init_global_memory * init_memory * inv_index_bin_tree * load_ditto_str * release_shm_mem * stat_search * traverse_tree * ve_statistical * * ORIGINS: 27 * * (C) COPYRIGHT International Business Machines Corp. 1993,1995 * All Rights Reserved * US Government Users Restricted Rights - Use, duplication or * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ /*************************** VESTATIS.C **************************** * $XConsortium: vestatis.c /main/9 1996/11/25 18:49:04 drk $ * 1993. * Statistically sorted stems search. * * $Log$ * Revision 2.3 1996/02/01 19:35:55 miker * AusText 2.1.11, DtSearch 0.3: Uses new single word parser/stemmers. * * Revision 2.2 1995/10/25 15:00:05 miker * Added prolog. * * Revision 2.1 1995/09/22 22:30:42 miker * Freeze DtSearch 0.1, AusText 2.1.8 * Revision 1.11 1995/09/07 23:30:15 miker * ...One last try (sigh). 
* Revision 1.10 1995/09/07 19:08:01 miker * Last fix incorrectly coded. * Revision 1.9 1995/09/07 16:25:11 miker * Fixed solaris bus fault caused by TREENODE structure * not being aligned on machines word boundary. Fault occurred * only when query contained more than one word. * Revision 1.8 1995/09/05 19:31:37 miker * Made usrblk and ausapi_msglist global. Replaced Socrates() * with calls to parser() and stemmer(). Deleted socblk. * Numerous name changes. All for DtSearch... */ #ifndef _ALL_SOURCE # define _ALL_SOURCE /* to pickup typedefs for shm vnodes */ #endif #include "SearchE.h" #include #include #include #include #include #include #include #include #include "vista.h" /*-------------------------- GLOBALS ----------------------------*/ /**** declaration for the global memory pointers ****/ #define PROGNAME "VESTATIS" #define MEMORY_SIZE 64000 /* 65536 is 64 KBytes of memory */ #define REC_TYPES 256 #define NORM_VALUE 30 #undef INFINITY /* XXX does GCC's __builtin_inff() work here? 
*/ #define INFINITY 9999.0 #define SORT_MESG 10000 #define CHAR_BITS 8 #define STACKSZ 256 #define MED_3_VALUE 7 #define TIME_ITERATION 1 #define LOG2 0.693147181 #define MS_vestatis 17 #define STRUCT_ALIGN sizeof(char*) static int SHM_FLAG = IPC_CREAT | S_IRUSR | S_IWUSR | S_IWGRP | S_IRGRP | S_IROTH | S_IWOTH; static char *mem_start; static char *cur_pos; static long mem_offset; static long total_memory_size; typedef struct q_s { char stem[DtSrMAXWIDTH_HWORD]; int count; } QUERY_STEM_STR; typedef struct mem_area { char *start_of_mem_block; long block_size; struct mem_area *next_block; } CNCRD_MEMORY_AREA_LIST; typedef struct bintree { struct bintree *rlink; /* ptr to next node in linked list or * right link in binary tree */ struct bintree *llink; /* left link in binary tree */ char *word; /* ptr to word in the query */ int count; } TREENODE; typedef struct s_a { DB_ADDR dba; float wght; DtSrINT32 num_word_hits; } STAT_STR; static STAT_STR *stat_array = NULL; static TREENODE *root_node; static TREENODE *top_of_stack; static TREENODE *stack; static TREENODE *pres; static TREENODE *prev; static TREENODE *next; static TREENODE *avail_node; static CNCRD_MEMORY_AREA_LIST *memory_blocks = NULL; static CNCRD_MEMORY_AREA_LIST *cur_mem_ptr; static QUERY_STEM_STR *query_stems = NULL; static DB_ADDR *word_addrs = NULL; static int num_diff_words = 0; static char begin_search; static char begin_sort; static char begin_load_ditto; static char begin_qsort; static char qsort_done; static DtSrINT32 real_num_rec; static DtSrINT32 num_hits; static DtSrINT32 total_num_addrs; static DtSrINT32 dba_offset; static unsigned char rec_type_tab[REC_TYPES]; static char vestat_msgbuf[256]; static int mes_search_box; static int slot_d00; extern char *chmat (); extern void find_keyword (char *cur_word, int vista_num); extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num); extern void write_wordstr (struct or_hwordrec * glob_word, int vista_num); static void stat_search 
(void); /* redefined below */ /********************************/ /* */ /* Release Shared Memory */ /* */ /********************************/ void release_shm_mem (void) { if (global_memory_ptr != NULL) { if (shmdt (global_memory_ptr) == -1) { DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 104, PROGNAME "104 Cannot detach shared memory ")); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; return; } if (shmctl (shm_id, IPC_RMID, NULL) == -1) { DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 110, PROGNAME "110 Cannot remove shared memory ")); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; return; } global_memory_ptr = NULL; } return; } /* release_shm_mem() */ /********************************/ /* */ /* Init Global Memory */ /* */ /********************************/ /* addrs - largest DBA slot in D00 file in the current database * r_addrs - total records count in the current database. */ static int init_global_memory (DtSrINT32 addrs, DtSrINT32 r_addrs) { long i, j; size_t k; i = DtSrMAX_STEMCOUNT * ((addrs >> 3) + 1) * 2 + addrs * sizeof (int) + sizeof (DB_ADDR) * r_addrs; j = sizeof (STAT_STR) * addrs + sizeof (DB_ADDR) * r_addrs; k = (i > j) ? i : j; shm_id = shmget (IPC_PRIVATE, k, SHM_FLAG); if ((global_memory_ptr = (char *) shmat (shm_id, (char *) 0, 0)) == ((char *) -1)) { DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 115, PROGNAME "115 No shared memory available")); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; return FALSE; } return TRUE; } /* init_global_memory() */ /****************************************/ /* */ /* efim_qsort */ /* */ /****************************************/ /* Custom quick sort algorithm (medium-of-3 partitioning). * Coded for efficiency given our expected data characteristics, * and for interruptability. 
*/ int efim_qsort (void) { time_t start_time; double time_dif; static long left, right; static long scan_l, scan_r, mid3, pvidx, l_size, r_size; static long sptr; static float pivot, temp, stack_l[STACKSZ], stack_r[STACKSZ]; static DB_ADDR dba; /* Test whether user has pushed STOP button since last call. */ if (usrblk.flags & USR_STOPSRCH) { if (OE_flags & OE_AUDIT) oe_write_audit_rec (-1L); usrblk.retncode = OE_USER_STOP; release_shm_mem (); return TRUE; } if (begin_qsort) { sptr = 0; left = 0; right = num_hits - 1; begin_qsort = FALSE; } time (&start_time); for (;;) { /* check iteration loop */ time_dif = difftime (time (NULL), start_time); if ((time_dif > TIME_ITERATION || usrblk.debug & USRDBG_ITERATE) && !(usrblk.flags & USR_NO_ITERATE)) { usrblk.retncode = OE_SEARCHING; usrblk.workproc = stat_search; mes_search_box = TRUE; return TRUE; } while (right > left) { if ((right - left) > MED_3_VALUE) { /* * compute value for the median-of-three partitioning */ mid3 = (left + right) >> 1; /* * three-sort left, middle, and right elements */ if ((stat_array + left)->wght < (stat_array + mid3)->wght) { temp = (stat_array + left)->wght; (stat_array + left)->wght = (stat_array + mid3)->wght; (stat_array + mid3)->wght = temp; dba = (stat_array + left)->dba; (stat_array + left)->dba = (stat_array + mid3)->dba; (stat_array + mid3)->dba = dba; } if ((stat_array + left)->wght < (stat_array + right)->wght) { temp = (stat_array + left)->wght; (stat_array + left)->wght = (stat_array + right)->wght; (stat_array + right)->wght = temp; dba = (stat_array + left)->dba; (stat_array + left)->dba = (stat_array + right)->dba; (stat_array + right)->dba = dba; } if ((stat_array + mid3)->wght < (stat_array + right)->wght) { temp = (stat_array + mid3)->wght; (stat_array + mid3)->wght = (stat_array + right)->wght; (stat_array + right)->wght = temp; dba = (stat_array + mid3)->dba; (stat_array + mid3)->dba = (stat_array + right)->dba; (stat_array + right)->dba = dba; } /* select pivot element 
index */ pvidx = right - 1; /* exchange pivot with the middle element */ temp = (stat_array + mid3)->wght; (stat_array + mid3)->wght = (stat_array + pvidx)->wght; (stat_array + pvidx)->wght = temp; dba = (stat_array + mid3)->dba; (stat_array + mid3)->dba = (stat_array + pvidx)->dba; (stat_array + pvidx)->dba = dba; /* setup for partitioning */ scan_l = left + 1; scan_r = right - 2; } else { /* select pivot element index */ pvidx = right; /* set scanning indexes */ scan_l = left; scan_r = right - 1; } /* select pivot element */ pivot = (stat_array + pvidx)->wght; for (;;) { /* scan from left */ while ((stat_array + scan_l)->wght > pivot) { scan_l++; } /* scan from right */ while ((stat_array + scan_r)->wght < pivot) { if (scan_r == 0) { break; } scan_r--; } /* if scan have met, exit inner loop */ if (scan_l >= scan_r) { break; } /* exchange elements */ temp = (stat_array + scan_r)->wght; (stat_array + scan_r)->wght = (stat_array + scan_l)->wght; (stat_array + scan_l)->wght = temp; dba = (stat_array + scan_r)->dba; (stat_array + scan_r)->dba = (stat_array + scan_l)->dba; (stat_array + scan_l)->dba = dba; /* move scans to next elements */ scan_l++; scan_r--; } if (scan_l != pvidx) { /* exchange finale element */ temp = (stat_array + pvidx)->wght; (stat_array + pvidx)->wght = (stat_array + scan_l)->wght; (stat_array + scan_l)->wght = temp; dba = (stat_array + pvidx)->dba; (stat_array + pvidx)->dba = (stat_array + scan_l)->dba; (stat_array + scan_l)->dba = dba; } /* calculate section sizes */ l_size = scan_l - left; r_size = right - scan_l; /* place largest section on stack */ if (l_size > r_size) { /* ignore 1-element sections */ if (l_size > 1) { sptr++; if (sptr == STACKSZ) { fputs (catgets (dtsearch_catd, MS_vestatis, 107, PROGNAME "107 Qsort stack overflow.\n"), aa_stderr); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return FALSE; } *(stack_l + sptr) = left; *(stack_r + sptr) = scan_l - 1; } /* ignore 1-element sections */ if (r_size != 
0) { left = scan_l + 1; } else { break; } } else { /* ignore 1-element sections */ if (r_size > 1) { sptr++; if (sptr == STACKSZ) { fputs (catgets (dtsearch_catd, MS_vestatis, 107, PROGNAME "107 Qsort stack overflow.\n"), aa_stderr); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return FALSE; } *(stack_l + sptr) = scan_l + 1; *(stack_r + sptr) = right; } /* ignore 1-element sections */ if (l_size != 0) { right = scan_l - 1; } else { break; } } } /* iterate with values from stack (if any) */ if (sptr) { left = *(stack_l + sptr); right = *(stack_r + sptr); sptr--; } else { break; } } qsort_done = TRUE; return TRUE; } /* efim_qsort() */ /****************************************/ /* */ /* fill_stem */ /* */ /****************************************/ /* "Visit" subroutine of descend_tree(), which is itself subroutine * of traverse_tree(). Builds query_stems array * and establishes its size in num_diff_words. */ static void fill_stem (TREENODE * cur_stem) { query_stems[num_diff_words].count = cur_stem->count; strcpy (query_stems[num_diff_words].stem, cur_stem->word); num_diff_words++; return; } /* fill_stem() */ /****************************************/ /* */ /* descend_tree */ /* */ /****************************************/ /* Subroutine of traverse_tree(), Robson tree traversal algorithm. */ static void descend_tree (void) { int not_done = TRUE; while (not_done) { /* end of 'descent' subalgorithm? 
*/ if ((pres->llink == NULL) && (pres->rlink == NULL)) { /* Preorder, Symmetric Order and Postorder */ fill_stem (pres); avail_node = pres; return; } if (pres->llink != NULL) { /* Preorder */ fill_stem (pres); next = pres->llink; pres->llink = prev; prev = pres; pres = next; } else { /* Preorder and Symmetric Order */ fill_stem (pres); next = pres->rlink; pres->rlink = prev; prev = pres; pres = next; } } return; } /* descend_tree() */ /********************************/ /* */ /* traverse_tree */ /* */ /********************************/ /* The algorithm is based on the J. M. ROBSON link inversion traversal * algorithm for binary trees. Ref. Thomas A. STANDISH pp. 77-78. */ static void traverse_tree (void) { int not_done = TRUE; int descend = TRUE; /* initialize the variables */ pres = root_node; prev = pres; top_of_stack = NULL; stack = NULL; while (not_done) { if (descend) { descend_tree (); } if (pres == root_node) { return; } if (prev->rlink == NULL) { /* Symmetric Order and Postorder */ /*** fill_stem(prev); ***/ next = prev->llink; prev->llink = pres; pres = prev; prev = next; descend = FALSE; } else { if (prev->llink == NULL) { /* Postorder */ /** fill_stem(prev); **/ next = prev->rlink; prev->rlink = pres; pres = prev; prev = next; descend = FALSE; } else { if (prev == top_of_stack) { /* Postorder */ /** fill_stem(prev); **/ next = stack; top_of_stack = stack->rlink; stack = stack->llink; next->llink = NULL; next->rlink = NULL; next = prev->llink; prev->llink = prev->rlink; prev->rlink = pres; pres = prev; prev = next; descend = FALSE; } else { /* Symmetric Order */ /*** fill_stem(prev); ***/ avail_node->llink = stack; avail_node->rlink = top_of_stack; stack = avail_node; top_of_stack = prev; next = prev->rlink; prev->rlink = pres; pres = next; descend = TRUE; } } } } } /* traverse_tree() */ /********************************/ /* */ /* Get Next Memory Block */ /* */ /********************************/ void get_next_memory_block (size_t node_size) { 
CNCRD_MEMORY_AREA_LIST *temp_ptr; temp_ptr = memory_blocks; /* * We run out of pre-allocated memory. Allocate additional block of * memory */ if (cur_mem_ptr == NULL) { total_memory_size += node_size; mem_start = (char *) malloc (total_memory_size); mem_offset = 0L; mem_offset += node_size; cur_pos = mem_start; if (mem_start == NULL) { fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 310, "%s Out of Memory. Need %ld bytes.\n"), PROGNAME "310", total_memory_size); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } /* * allocate space for the next member of the memory blocks link * list */ memory_blocks = (CNCRD_MEMORY_AREA_LIST *) malloc (sizeof (CNCRD_MEMORY_AREA_LIST) + 2); if (memory_blocks == NULL) { fputs (catgets (dtsearch_catd, MS_vestatis, 314, PROGNAME"314 Out of Memory.\n"), aa_stderr); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } memory_blocks->start_of_mem_block = mem_start; memory_blocks->next_block = temp_ptr; memory_blocks->block_size = total_memory_size; /**** allocation of initial memory blocks is done ****/ } /* Use next available block of memory */ else { mem_start = cur_mem_ptr->start_of_mem_block; total_memory_size = cur_mem_ptr->block_size; cur_mem_ptr = cur_mem_ptr->next_block; mem_offset = 0L; mem_offset += node_size; cur_pos = mem_start; } return; } /* get_next_memory_block() */ /********************************/ /* */ /* build_bin_tree */ /* */ /********************************/ /* Subroutine of inv_index_bin_tree(). * Called for each stem in query. * Inserts new stem (already uppercase) into tree * or increments existing stem's count. * Returns TRUE and incr num_diff_words if new stem inserted. * Returns FALSE if existing stem's count merely incremented. * Returns FALSE and OE_ABORT set on error. 
*/
/* The tree is searched BEFORE any memory is carved out for a node,
 * so a stem that is already present never consumes allocator space,
 * never forces a new memory block, and can never fail for lack of
 * memory.
 */
static int      build_bin_tree (char *cur_word)
{
    int             cmp;
    size_t          treenode_size;
    TREENODE       *node;
    TREENODE      **this_link;

    /* Find the stem, or the NULL link where it belongs */
    this_link = &root_node;
    while (*this_link != NULL) {
	cmp = strcmp (cur_word, (*this_link)->word);
	if (cmp == 0) {
	    /* Already in the tree: just count the occurrence */
	    (*this_link)->count++;
	    return FALSE;
	}
	this_link = (cmp < 0) ?
	    &(*this_link)->llink : &(*this_link)->rlink;
    }

    /* Determine the amount of memory needed for the
     * new node: the structure, the word, and its terminator.
     * Add in a pad amount to align it on the machine's word
     * boundary.  Some machines aren't happy about misaligned
     * structures and we're emulating our own malloc.
     * (Thanks, and a tip o' the hat to Takuki Kamiya).
     */
    treenode_size = sizeof (TREENODE) + strlen (cur_word) + 2;
    treenode_size +=
	(STRUCT_ALIGN - treenode_size % STRUCT_ALIGN) % STRUCT_ALIGN;

    /* Carve the node out of the current block, moving on to
     * another block when this one is exhausted.
     */
    mem_offset += treenode_size;
    if (mem_offset > total_memory_size) {
	get_next_memory_block (treenode_size);
	if (usrblk.retncode == OE_ABORT)
	    return FALSE;
    }
    node = (TREENODE *) cur_pos;
    cur_pos = mem_start + mem_offset;

    /* The word is stored immediately behind the node structure */
    node->llink = NULL;
    node->rlink = NULL;
    node->word = (char *) node + sizeof (TREENODE);
    node->count = 1;
    strcpy (node->word, cur_word);

    /* Insert new node at the location found above */
    *this_link = node;
    num_diff_words++;
    return TRUE;
}  /* build_bin_tree() */


/************************/
/*                      */
/*      init_memory     */
/*                      */
/************************/
/* Initialize the first block of memory for the binary tree.
 * This function is called only once at each run of the offline program.
*/ void init_memory (void) { mem_start = (char *) malloc (MEMORY_SIZE); if (mem_start == NULL) { fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 310, "%s Out of Memory. Need %ld bytes.\n"), PROGNAME "310", MEMORY_SIZE); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } total_memory_size = MEMORY_SIZE; cur_pos = mem_start; mem_offset = 0L; /* * Allocate space for the first member of the memory blocks link list */ memory_blocks = (CNCRD_MEMORY_AREA_LIST *) malloc (sizeof (CNCRD_MEMORY_AREA_LIST) + 2); if (memory_blocks == NULL) { fputs (catgets (dtsearch_catd, MS_vestatis, 314, PROGNAME "314 Out of Memory.\n"), aa_stderr); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } memory_blocks->start_of_mem_block = mem_start; memory_blocks->block_size = total_memory_size; memory_blocks->next_block = NULL; cur_mem_ptr = NULL; return; } /* init_memory() */ /********************************/ /* */ /* inv_index_bin_tree */ /* */ /********************************/ /* Builds binary tree of all stems in query. * Returns TRUE and loads num_diff_words with number * of nodes in tree if tree successfully built, * or if query is empty. * Returns FALSE on any error (causing eventual engine abort). */ static int inv_index_bin_tree (void) { char *cptr; DBLK *dblk = usrblk.dblk; PARG parg; /* First time initialize the first block of memory */ if (memory_blocks == NULL) { /** INITIALIZE MEMORY **/ init_memory (); if (usrblk.retncode == OE_ABORT) return FALSE; root_node = NULL; } /* WORD LOOP. Parse and stem each word in query. * Add each stem to bin tree or incr its count. 
*/ memset (&parg, 0, sizeof(PARG)); parg.dblk = dblk; parg.string = usrblk.query; for ( cptr = dblk->parser (&parg); cptr; cptr = dblk->parser (NULL)) { build_bin_tree (dblk->stemmer (cptr, dblk)); if (usrblk.retncode == OE_ABORT) return FALSE; } return TRUE; } /* inv_index_bin_tree() */ /************************/ /* */ /* comp_stat */ /* */ /************************/ int comp_stat (void *val1, void *val2) { STAT_STR *bkt1; STAT_STR *bkt2; bkt1 = (STAT_STR *) val1; bkt2 = (STAT_STR *) val2; if ((bkt2->wght) > (bkt1->wght)) { return 1; } else { return -1; } } /* comp_stat() */ /************************************************/ /* */ /* load_ditto_str */ /* */ /************************************************/ /* Last function called from statistical search. * Builds a real AusText hitlist from the sorted stat_array, * translating the statistical weights to AusText 'proximity' * values, and truncating the hitlist at user's maxhits. * Working variables made static for speeeeeeeed. */ void load_ditto_str (void) { struct or_objrec cur_rec; /* structure taken from austext.h */ struct or_miscrec rec_data; static time_t start_time; static double time_dif; static DB_ADDR dba1; static DtSrResult *cur_ditto_mem; static DtSrResult *ditto_llist; static DtSrResult *temp_ditto; static int debugging; static int m; static DtSrINT32 d0024; static DtSrINT32 maxhits; static DtSrINT32 i32, i32_start, j32; static int fzkeysz, fzkey_remaining, abstrsz, dittosz; static char *src, *targ, *targend; static int check_dates = FALSE; static double sum = 0.0; static double sum1, sum2, sum3, sum4; debugging = (usrblk.debug & USRDBG_SRCHCMPL); maxhits = usrblk.dblk->maxhits; fzkeysz = usrblk.dblk->dbrec.or_fzkeysz; abstrsz = usrblk.dblk->dbrec.or_abstrsz; dittosz = sizeof (DtSrResult) + abstrsz + 16; if (debugging) fprintf (aa_stderr, PROGNAME "773 " "numhits=%ld maxhits=%d numwords=%d abstrsz=%d\n", (long)num_hits, (int)maxhits, num_diff_words, abstrsz); if (begin_load_ditto) { /* test for zero 
hits */ if (num_hits == 0) { usrblk.workproc = dummy_workproc; usrblk.retncode = OE_NOTAVAIL; if (OE_flags & OE_AUDIT) oe_write_audit_rec (0L); release_shm_mem (); return; } check_dates = (usrblk.objdate1 || usrblk.objdate2); /* In order to translate statistical weight into an AusText * proximity, basically you have to invert it, then scale it. * The statistical weight is a similarity measure: the * larger it is the more similar the document to the query. * But AusText 'proximity' is like a 'distance' measure, * the smaller the number the closer the document is to the query. * * First 'normalize' each document's statistical * weight to be a fraction between 0 and 1. Do this * by calculating a normalization factor (sum1), the * sqrt of the sum of squares of first NORM_VALUE weights. * (Trying to make the inversion scheme produce * reasonable proximity numbers for these first records). * * To complete proximity initialization, he uses * the sum1 factor to determine and keep the first record's * normalized weight (sum), presumably a fraction close * to 1.0, and the first record's proximity (sum2), * basically the percent * value that the first doc is 'distant' from perfection (1.0 or 100%). * For example, if the normalized weight of the first record is .931 * then the proximity will be 7 (100% - 93% = 7%). He does this now * because he's going to use this first proximity (sum2) as a scaling * factor to stretch out all the subsequent proximities so they * look reasonable. */ sum = 0.0; for (i32 = 0; i32 < num_hits; i32++) { sum1 = (double) (stat_array + i32)->wght / (double) num_diff_words; sum += sum1 * sum1; if (i32 >= NORM_VALUE) break; } /* * sum1 = normalization factor. * sum = normalized weight (betw 0 and 1) of first record. * sum2 = proximity of first record, proximity scale factor. 
*/ sum1 = sqrt (sum); sum = ((stat_array + 0)->wght / num_diff_words) / sum1; sum2 = (1.0 - sum) * 100.0; if (debugging) fprintf (aa_stderr, PROGNAME "844 " "normfac=%.2lf normwt(#1)=%.2lf prox(#1)=%.2lf\n", sum1, sum, sum2); /* Preallocate first hit on ditto_list */ ditto_llist = (DtSrResult *) austext_malloc (dittosz, PROGNAME "449", NULL); j32 = 0; i32_start = 0; d0024 = OR_D00 << 24; begin_load_ditto = FALSE; } /* endif (begin_load_ditto) */ /* Test whether user has pushed STOP button since last call */ if (usrblk.flags & USR_STOPSRCH) { if (OE_flags & OE_AUDIT) oe_write_audit_rec (-1L); usrblk.retncode = OE_USER_STOP; release_shm_mem (); if (j32 == 0) free (ditto_llist); else free_llist ((LLIST **) &ditto_llist); return; } time (&start_time); /**** MAIN DtSrResult LIST BUILD LOOP ****/ for (i32 = i32_start; i32 < num_hits; i32++) { /* check iteration loop */ time_dif = difftime (time (NULL), start_time); if ((time_dif > TIME_ITERATION || usrblk.debug & USRDBG_ITERATE) && !(usrblk.flags & USR_NO_ITERATE)) { i32_start = i32; usrblk.retncode = OE_SEARCHING; usrblk.workproc = load_ditto_str; mes_search_box = TRUE; return; } dba1 = ((stat_array + i32)->dba * slot_d00 - dba_offset) | d0024; /* * Don't use CRSET or RECREAD macros here so we can trap invalid * dba errs. 
*/ d_crset (&dba1, saveusr.vistano); if (db_status < 0) { fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 437, "%s: db_status = %d, dba = %d:%ld (x'%08.8lx'), vistano = %d\n"), PROGNAME "437", db_status, (dba1 & 0xff000000) >> 24, dba1 & 0xffffff, dba1, saveusr.vistano); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } d_recread (&cur_rec, saveusr.vistano); if (db_status < 0) { fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 437, "%s: db_status = %d, dba = %d:%ld (x'%08.8lx'), vistano = %d\n"), PROGNAME "437", db_status, (dba1 & 0xff000000) >> 24, dba1 & 0xffffff, dba1, saveusr.vistano); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } swab_objrec (&cur_rec, NTOH); /* Skip any record with undesired keytype * char, ie first char of key. */ if (*(rec_type_tab + cur_rec.or_objkey[0]) == 0) continue; /* Skip record if out of date range. */ if (check_dates) if (!objdate_in_range (cur_rec.or_objdate, usrblk.objdate1, usrblk.objdate2)) continue; if (j32 == 0) /* first ditto node already allocated */ cur_ditto_mem = ditto_llist; else { cur_ditto_mem = malloc (dittosz); if (cur_ditto_mem == NULL) { fputs ( catgets (dtsearch_catd, MS_vestatis, 504, PROGNAME "504 Cannot allocate cur_ditto\n"), aa_stderr); OE_flags |= OE_PERMERR; usrblk.retncode = OE_ABORT; release_shm_mem (); return; } temp_ditto->link = cur_ditto_mem; } /* Load the ditto_list for this dba */ memset (cur_ditto_mem, 0, sizeof(DtSrResult)); cur_ditto_mem->dbn = OE_dbn; cur_ditto_mem->dba = dba1; strcpy (cur_ditto_mem->reckey, cur_rec.or_objkey); cur_ditto_mem->objsize = cur_rec.or_objsize; cur_ditto_mem->objdate = cur_rec.or_objdate; cur_ditto_mem->objflags = cur_rec.or_objflags; cur_ditto_mem->objuflags = cur_rec.or_objuflags; cur_ditto_mem->objtype = cur_rec.or_objtype; cur_ditto_mem->objcost = cur_rec.or_objcost; /*****cur_ditto_mem->flags = 0;****/ cur_ditto_mem->abstractp = (char *) cur_ditto_mem + sizeof (DtSrResult); 
cur_ditto_mem->abstractp[0] = 0; /* Translate statistical weight into AusText proximity. * sum3 = normalized weight (betw 0 and 1). * sum4 = prox = ratio of this normalized weight to * first rec's weight, scaled by the first rec's proximity. * No proximity is allowed to exceed some very large number. */ sum3 = ((stat_array + i32)->wght / num_diff_words) / sum1; sum4 = sum2 * (sum / sum3); if (sum4 > INFINITY) sum4 = INFINITY; cur_ditto_mem->proximity = sum4; if (debugging) fprintf (aa_stderr, " --> dba=%ld normwt=%.4lf prox=%d key='%s'\n", dba1, sum3, cur_ditto_mem->proximity, cur_ditto_mem->reckey); /* * The abstract immediately follows the fuzzy key in the FZKABS * misc recs. It may span several recs. */ if (abstrsz > 0) { targ = cur_ditto_mem->abstractp; targend = targ + abstrsz - 1; fzkey_remaining = fzkeysz; SETOR (PROGNAME "2270", OR_OBJ_MISCS, saveusr.vistano); FINDFM (PROGNAME "2271", OR_OBJ_MISCS, saveusr.vistano); while (db_status == S_OKAY) { RECREAD (PROGNAME "549", &rec_data, saveusr.vistano); NTOHS (rec_data.or_misctype); if (rec_data.or_misctype == ORM_FZKABS) { src = (char *) rec_data.or_misc; for (m = 0; m < sizeof(rec_data.or_misc); m++) { if (fzkey_remaining > 0) { src++; fzkey_remaining--; continue; /* inner for-loop on m */ } *targ = *src; if (*src++ == 0 || targ++ >= targend) { *targ = 0; targ = targend; /* make outer loop end */ break; } } /* end for-loop for curr misc rec */ } /* endif: misctype == FZKABS */ if (targ >= targend) break; FINDNM (PROGNAME "545", OR_OBJ_MISCS, saveusr.vistano); } /* end while-loop */ } /* endif: (abstrsz > 0) */ cur_ditto_mem->link = NULL; temp_ditto = cur_ditto_mem; /* Increment to next hit. * Break loop when we reach user's specified maxhits. */ j32++; /* [j32 same as i] !? 
*/ if (j32 >= maxhits) break; } /* i32-loop on each hit in ditto list */ if (j32 == 0) { usrblk.workproc = dummy_workproc; usrblk.retncode = OE_NOTAVAIL; if (OE_flags & OE_AUDIT) oe_write_audit_rec (0L); release_shm_mem (); return; } if (num_hits >= maxhits) { if (!(usrblk.flags & USR_NO_INFOMSGS)) { sprintf (vestat_msgbuf, catgets (dtsearch_catd, MS_vestatis, 421, "$s Total Number Hits = %ld. Discarded hits beyond maximum number specified."), PROGNAME "421", (long)num_hits); DtSearchAddMessage (vestat_msgbuf); } } free_llist ((LLIST **) &usrblk.dittolist); usrblk.dittolist = ditto_llist; usrblk.dittocount = j32; usrblk.workproc = dummy_workproc; usrblk.retncode = OE_OK; if (OE_flags & OE_AUDIT) oe_write_audit_rec ((long) num_hits); /***** Free shared memory *****/ release_shm_mem (); return; } /* load_ditto_str() */ /****************************************/ /* */ /* stat_search */ /* */ /****************************************/ /* Subroutine of ve_statistical() and interruptable workproc. */ static void stat_search (void) { time_t start_time; double time_dif; DB_ADDR temp, temp1; struct or_hwordrec word1; /* structure taken from austext.h */ double idf, cur_weight; int qs; DtSrINT32 int32, j32; /*****@@@ size_t size;****/ static int qs_start; /* Test whether user has pushed STOP button since last call */ if (usrblk.flags & USR_STOPSRCH) { if (OE_flags & OE_AUDIT) oe_write_audit_rec (-1L); usrblk.retncode = OE_USER_STOP; release_shm_mem (); return; } if (begin_sort) { begin_qsort = TRUE; qsort_done = FALSE; if (begin_search) { qs_start = 0; begin_search = FALSE; } time (&start_time); /* * For every query stem, read d99. For every dba in d99 for each * stem, update object's stat array node with rec count and a * weight based on the IDF for this stem. (IDF is described * below). Saveusr.stemcount = lesser of DtSrMAX_STEMCOUNT or * num_diff_words. All stems are stored in d99 beginning with ^O * (decimal 15). 
Index qs = curr query stem */ for (qs = qs_start; qs < saveusr.stemcount; qs++) { word1.or_hwordkey[0] = 15; word1.or_hwordkey[1] = '\0'; strcat (word1.or_hwordkey, query_stems[qs].stem); find_keyword (word1.or_hwordkey, saveusr.vistano); /* * If word is not in the database, ignore it. [ If word * not in database, why not take the next stem in query_stems * array, if any? ] */ if (db_status != S_OKAY) word1.or_hwaddrs = 0; else read_wordstr (&word1, saveusr.vistano); if (word1.or_hwaddrs > 0) { fseek (usrblk.dblk->iifile, word1.or_hwoffset, SEEK_SET); /****@@@size = sizeof (DB_ADDR) * word1.or_hwaddrs;***/ fread (word_addrs, sizeof(DB_ADDR), (size_t)word1.or_hwaddrs, usrblk.dblk->iifile); /* * Calculate IDF (inverse document frequency) for this * word. The IDF is a statistical ratio of the number * of documents containing the word and the total * number of documents in the entire corpus. * It is calculated here on the fly to save space in the * database. IDF = {log (totnumdocs / numdocswithword) / * log(2)} + 1. Note that an IDF of 1 means the word * occurs in every doc (it's meaningless). An IDF of 19 * means the word occurs once in every 300,000 recs. * Note that by dividing by log(2) the IDF also tells * us how many binary digits are necessary to discriminate * the word. Finally I think 1.0 was added to prevent * it ever becoming zero when converted to integer. */ idf = (log ((double) real_num_rec / (double) word1.or_hwaddrs) / LOG2) + 1.0; /* * WEIGHT PASS #1: * Update the stat array node for each doc (ie dba) which * includes this stem. Specifically, * sum the product of the IDF and word-doc weight into * the 'wght' bucket, and update the number of query * words this doc contains. Note that the d99 dba format * is slot# in hi 3 bytes, word-doc weights in lo byte. 
*/ for (j32 = 0; j32 < word1.or_hwaddrs; j32++) { NTOHL (word_addrs [j32]); temp1 = *(word_addrs + j32); /* d99 dba */ cur_weight = (double) (temp1 & 0xFF); /* lo byte */ temp = temp1 >> 8; /* slot# */ ((stat_array + temp)->num_word_hits)++; ((stat_array + temp)->dba) = temp; ((stat_array + temp)->wght) += (float) (cur_weight * idf); } } /* end if (word1.or_hwaddrs > 0), ie * query word exists */ /* * If the query words were common, the last double loop may * have taken a long time. If so, return now to the user * interface to allow the gui to respond to button clicks * (like CANCEL buttons). */ time_dif = difftime (time (NULL), start_time); if ((time_dif > TIME_ITERATION || usrblk.debug & USRDBG_ITERATE) && !(usrblk.flags & USR_NO_ITERATE)) { if (qs == saveusr.stemcount - 1) { usrblk.retncode = OE_SEARCHING; usrblk.workproc = stat_search; mes_search_box = TRUE; return; } else { qs_start = qs + 1; usrblk.retncode = OE_SEARCHING; usrblk.workproc = stat_search; mes_search_box = TRUE; return; } } /* end if (time_dif > TIME_ITERATION */ } /* end qs-loop on each query stem */ /* * Entire stat array contains one node for every possible dba * (doc). Collapse the records that were actually referenced by * the query words into the top portion of the array. * Set 'num_hits' to the collapsed stat array size, ie * num_hits = the total number of docs that will be on * the prelim hitlist, prior to sort and truncation to user's maxhits. * * WEIGHT PASS #2: * While we're at it, finalize the accumulated 'wght' field, which * will be our sort field, by multiplying it by the ratio of the * number of query words in the document divided by the number of * words in the query. * Thus the final sort field for each doc is the sum * over all the query words in the doc of 3 factors: * 1) IDF (relative weight of each query word in corpus), times * 2) d99wght (relative weight of each query word in doc), times * 3) weight based on number of different query words in this doc. 
*/ num_hits = 0; for (int32 = 0; int32 < total_num_addrs; int32++) { if (stat_array[int32].wght > 0) { (stat_array + num_hits)->num_word_hits = (stat_array + int32)->num_word_hits; (stat_array + num_hits)->wght = (stat_array + int32)->wght * ((double) (stat_array + int32)->num_word_hits / (double) num_diff_words); (stat_array + num_hits)->dba = (stat_array + int32)->dba; num_hits++; } } /* * We're about to sort the actual hits. If the number of them * exceeds a certain threshold, return to the user interface one * more time to again allow the gui to respond to user CANCEL * events. */ if (num_hits > SORT_MESG && !(usrblk.flags & USR_NO_ITERATE)) { if (!mes_search_box) { DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 990, PROGNAME"990 The system is now sorting. Please wait.")); } usrblk.retncode = OE_SEARCHING; usrblk.workproc = stat_search; mes_search_box = TRUE; begin_sort = FALSE; return; } } /* end if (begin_sort) */ /* Sort the preliminary hitlist (stat_array) * by the calculated statistical weights. */ if (!efim_qsort ()) return; /* Build a real AusText hitlist from the sorted stat_array, * translating the statistical weights to AusText 'proximity' * values, and truncating the hitlist at user's maxhits. 
*/ if (qsort_done) { begin_load_ditto = TRUE; load_ditto_str (); } return; } /* stat_search() */ /****************************************/ /* */ /* ve_statistical */ /* */ /****************************************/ void ve_statistical (void) { void stat_search (void); DB_ADDR dba; int i, j; DtSrINT32 int32; mes_search_box = FALSE; usrblk.flags &= ~USR_STOPSRCH; /* turn off stop button */ usrblk.retncode = OE_OK; usrblk = usrblk; saveusr.vistano = usrblk.dblk->vistano; saveusr.dittolist = NULL; saveusr.dittocount = 0L; saveusr.iterations = 1; /****** find total number of records in the database *********/ RECFRST (PROGNAME "1067", OR_OBJREC, saveusr.vistano); CRGET (PROGNAME "1068", &dba, saveusr.vistano); real_num_rec = usrblk.dblk->dbrec.or_reccount; slot_d00 = usrblk.dblk->dbrec.or_recslots; dba_offset = slot_d00 - (dba & 0x00FFFFFF); total_num_addrs = (usrblk.dblk->dbrec.or_maxdba - (dba & 0x00FFFFFF) + 1) / slot_d00 + 1; /* stat_array size = 1 node for every possible object */ if (usrblk.query[0] == 0) { DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 677, PROGNAME "677 Query field is empty.")); usrblk.retncode = OE_BAD_QUERY; return; } /* * Build binary tree of each stem in query containing count of number * of occurrences of stem in query. Loads num_diff_words with number * of nodes in tree. */ num_diff_words = 0; inv_index_bin_tree(); if (usrblk.retncode == OE_ABORT) return; if (num_diff_words < 1) { usrblk.retncode = OE_NOTAVAIL; return; } /***** allocate memory for query_stems array *********/ if (query_stems != NULL) { free (query_stems); query_stems = NULL; } query_stems = (QUERY_STEM_STR *) austext_malloc (sizeof (QUERY_STEM_STR) * (num_diff_words + 1), PROGNAME " 371", NULL); /* * Traverse tree to build query_stems array, each array node = tree * node, ie each unique stem in query and its count in query. * Num_diff_words now used as index for growing array. 
*/ num_diff_words = 0; traverse_tree (); /* * For each new query initialize memory offset, current memory start * position, and total size for the available memory. Starts from the * first member in the link list of memory blocks. */ root_node = NULL; mem_start = memory_blocks->start_of_mem_block; total_memory_size = memory_blocks->block_size; cur_mem_ptr = memory_blocks->next_block; cur_pos = mem_start; mem_offset = 0L; /* * Copy first DtSrMAX_STEMCOUNT stems into the saveusr.stems. [So no more * than DtSrMAX_STEMCOUNT will be used in search or hiliting!] */ for (i = 0; i < num_diff_words; i++) { if (i == DtSrMAX_STEMCOUNT) break; strcpy (usrblk.stems[i], query_stems[i].stem); } usrblk.stemcount = i; saveusr.stemcount = i; /* Prepare a string holding first char of desired record ids */ for (i = 0; i < REC_TYPES; i++) *(rec_type_tab + i) = 0; for (i = 0, j = 0; i < usrblk.dblk->ktcount; i++) if (usrblk.dblk->keytypes[i].is_selected) *(rec_type_tab + usrblk.dblk->keytypes[i].ktchar) = 1; saveusr.ktchars[j] = '\0'; /* * New code using shared memory: * Allocate global block of shared memory, * and assign parts of this memory to each array. * Stat array has an element for every possible db object. * Set whole stat array to binary zeroes. */ if (!init_global_memory (total_num_addrs, real_num_rec)) return; stat_array = (STAT_STR *) global_memory_ptr; word_addrs = (DB_ADDR *) (global_memory_ptr + total_num_addrs * sizeof (STAT_STR)); for (int32 = 0; int32 < total_num_addrs; int32++) { (stat_array + int32)->wght = 0.0; (stat_array + int32)->num_word_hits = 0; } /***** end of memory allocation for statistical array *********/ /* stat_search(): Search d99 and sum the statistical weights. * Calls efim_qsort() to sort the hitlist by the weights. */ begin_search = TRUE; /* global initialization and state flags */ begin_sort = TRUE; stat_search (); return; } /* ve_statistical() */ /*************************** VESTATIS.C ****************************/