This is irbuild.c in view mode; [Download] [Up]
/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ #ifndef lint static char *RCSid = "$Header: /tmp_mnt/net/quake/proj/wais/wais-8-b5/ir/RCS/irbuild.c,v 1.47 92/05/10 14:48:17 jonathan Exp $"; #endif /* * Building an index with a Unix shell interface. * * -brewster 6/90 */ /* Change log: * added -stdio option from jik@athena.mit.edu * $Log: irbuild.c,v $ * Revision 1.47 92/05/10 14:48:17 jonathan * Updated for release. * * Revision 1.46 92/05/08 10:03:17 jonathan * Adjusted memory paramters. It's closer... * * Revision 1.45 92/05/06 17:26:46 jonathan * Added switch for indexing contents, new user-specified type name, new type: * filename, which only puts the name of the file in the header. * * Revision 1.44 92/04/25 21:14:35 brewster * added ziff * * Revision 1.43 92/04/22 15:29:13 jonathan * Added jargon to usage message. * * Revision 1.42 92/04/01 17:08:50 jonathan * Added FTP type. * * Revision 1.41 92/03/25 18:49:39 jonathan * Added log_level and log_file arguments. * * Revision 1.40 92/03/22 18:38:14 brewster * added objective C filter * * Revision 1.39 92/03/20 11:02:44 jonathan * Added code to handle switches for word_pairs and word_postition info. * * Revision 1.38 92/03/17 07:34:32 jonathan * Fixed spacing in usage message. * * Revision 1.37 92/03/10 10:42:51 morris * fixed small bug in command line argument handleing. doesn't die if there * are no args. * * Revision 1.36 92/03/05 07:05:32 shen * add cm grow percent and textsize to command line and init search engine * * Revision 1.35 92/03/04 16:34:09 jonathan * Set wais_pid from getpid(). * * Revision 1.34 92/02/20 09:49:37 jonathan * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl. * * Revision 1.33 92/02/17 14:21:08 jonathan * Added switch to disable creation of catalog (-nocat). * * Revision 1.32 92/02/17 12:41:55 jonathan * Added RCSid. 
* * Revision 1.31 92/02/17 12:41:01 jonathan * Build catalog after completion of indexing. * * Revision 1.30 92/02/12 13:22:53 jonathan * Added "$Log" so RCS will put the log message in the header * */ /* to do: * done: make incremental indexing not index things that are already index * add extra arg -register that will send in description of the server to * the directory of servers. * done: create a source struct in the .src file * make it continuously index to keep itself uptodate. * */ #include <string.h> #include <sys/types.h> #include <sys/param.h> #include "irdirent.h" #include "cutil.h" #include "futil.h" #include "irfiles.h" #include "irtfiles.h" #include "panic.h" #include "ircfiles.h" #include "version.h" #include "irext.h" #define INDEXER_DATE "Sun May 10 1992" /* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */ extern boolean indexingForBeta; void usage(command) char *command; { /* no args */ fprintf(stderr,"Usage: %s [-d index_filename]\n", command); fprintf(stderr," [-a] /* adding to an existing index, otherwise it erases the index */\n"); fprintf(stderr," [-r] /* recursively index subdirectories */\n"); fprintf(stderr," [-mem mbytes] /* number of megabytes to run this in */\n"); fprintf(stderr," [-register] /* registers the database with the directory of servers.\n"); fprintf(stderr," This should be done with care. */\n"); fprintf(stderr," [-export] /* uses short dbname and port 210 */\n"); fprintf(stderr," [-e [file]] /* set log output to file, or /dev/null if not specified */\n"); fprintf(stderr," [-l log_level] /* set log level. 
0 means log nothing,\n"); fprintf(stderr," 10 [the default] means log everything */\n"); fprintf(stderr," [-v] /* print the version of the software */\n"); fprintf(stderr," [-stdin] /* read file names from stdin */\n"); fprintf(stderr," [-pos | -nopos] /* include (don't include - default) word position information /*\n"); fprintf(stderr," [-nopairs | -pairs] /* don't include (or include - default) word pairs /*\n"); fprintf(stderr," [-nocat] /* inhibit creation of catalog /*\n"); fprintf(stderr," [-contents] /* Index the contents: this is good for types that\n"); fprintf(stderr," inhibit the indexing of the contents (like gif). /*\n"); fprintf(stderr," [-nocontents] /* Index only the filename, not the contents /*\n"); fprintf(stderr," [-cmmem mem%] /* percent of CM memory (CM code only) */\n"); fprintf(stderr," [-T type] /* type becomes the \"TYPE\" of the document. */\n"); fprintf(stderr," [-t /* format of the file. if none then each file is a document */\n"); fprintf(stderr," text /* simple text files, this is the default */\n"); fprintf(stderr," | bibtex /* BibTeX / LaTeX format */\n"); fprintf(stderr," | bio /* biology abstract format */\n"); fprintf(stderr," | cmapp /* CM applications from Hypercard */\n"); fprintf(stderr," | dash /* entries separated by a row of dashes */\n"); fprintf(stderr," | dvi /* dvi format */\n"); fprintf(stderr," | emacsinfo /* the GNU documentation system */\n"); fprintf(stderr," | first_line /* first line of file is headline */\n"); fprintf(stderr," | filename /* uses only the filename part of the pathname for the title */\n"); fprintf(stderr," | ftp /* special type for FTP files. 
First line of file is headline */\n"); fprintf(stderr," | gif /* gif files, only indexes the filename */\n"); fprintf(stderr," | irg /* internet resource guide */\n"); fprintf(stderr," | jargon /* Jargon File 2.9.8 format*/\n"); fprintf(stderr," | mail_digest /* standard internet mail digest format */\n"); fprintf(stderr," | mail_or_rmail /* mail or rmail or both */\n"); fprintf(stderr," | medline /* medline format */\n"); fprintf(stderr," | mh_bboard /* MH bulletin board format */\n"); fprintf(stderr," | netnews /* netnews format */\n"); fprintf(stderr," | nhyp /* ?:? hyper text format, Polytechnic of Central London */\n"); fprintf(stderr," | one_line /* each line is a document */\n"); fprintf(stderr," | para /* paragraphs separated by blank lines */\n"); fprintf(stderr," | pict /* pict files, only indexes the filename */\n"); fprintf(stderr," | ps /* postscript format */\n"); fprintf(stderr," | refer /* refer format */\n"); fprintf(stderr," | rn /* netnews saved by the [rt]?rn newsreader */\n"); fprintf(stderr," | server /* server structures for the dir of servers */\n"); fprintf(stderr," | objc /* objective-C .h and .m files */\n"); fprintf(stderr," | tiff /* tiff files, only indexes the filename */\n"); fprintf(stderr," ] filename filename ...\n"); } char *log_file_name = NULL; FILE *logfile; extern boolean index_contents; /* This is the MAIN for building an index. 
*/

/* main: command-line driver of the indexer.
 *
 *   argc, argv - the usual program arguments: switches (each starting with
 *                '-') followed by the names of the files to index.
 *
 * Steps, in order:
 *   1. parse the switches into the local settings below;
 *   2. initialize the search engine and open (or create) the database;
 *   3. size the in-memory word table from the -mem setting;
 *   4. index every filename, taken from the command line or, with -stdin,
 *      read one per line from standard input;
 *   5. write the source description if none exists yet, optionally
 *      register it, build the catalog unless -nocat was given, and close
 *      the database.
 *
 * The function never returns; every path ends in exit() or panic().
 * NOTE(review): panic() is assumed not to return -- the argument checks
 * below use the value right after the panic call, so they rely on that.
 */
void main(argc, argv)
int argc;
char *argv[];
{
  database* db = NULL;
  long argc_copy = argc;
  char **argv_copy = argv;
  char *next_argument;
  char index_filename[1000];
  boolean (*separator_function)();
  void (*header_function)();
  void (*finish_header_function)();
  long (*date_function)();
  boolean adding_to_existing_index = false;
  boolean traverse_directory = false;
  boolean word_positions = false;
  boolean word_pairs = true;
  long memory_to_use = -1;
  long cm_mem_percent = 0;   /* default */
  long grow_percent = 0;     /* default */
  long text_size = 0;        /* default */
  boolean check_for_text_file = false;
  boolean register_database = false;
  boolean export_database = false;
  boolean read_files_from_stdin = false;
  boolean make_catalog = true;
  boolean first_filename = true; /* next_argument already holds the first filename */
  char data_filename[MAXPATHLEN];
  char *typename = NULL;     /* this is what the user said */
  char *type = NULL;         /* this is the type stored with the db */
  long start_of_filenames;
  long hashtable_size = 1L<<16;
  long flush_after_n_words = 300000;
  char *command_name;

  next_argument = next_arg(&argc, &argv);
  separator_function = NULL; /* initialize to nil */
  header_function = NULL;
  date_function = NULL;
  finish_header_function = NULL;
  type = "TEXT";             /* default to text */
  typename = "Text";
  command_name = next_argument;
  logfile = stderr;
  wais_pid = getpid();

  if(0 == argc){
    usage(command_name);
    exit(0);
  }

#ifdef THINK_C
  strcpy(index_filename, "wais:System Folder:wais-index:index");
#else
  strcpy(index_filename, "index"); /* in the current directory */
#endif /* THINK_C */

  if(NULL == (next_argument = next_arg(&argc, &argv))){
    fprintf(stderr,"No arguments specified\n");
    exit(0);
  }

  while((next_argument != NULL) && '-' == next_argument[0]){
    /* then we have an argument to process */
    if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */
       (0 == strcmp("-d", next_argument))){
      if(NULL == (next_argument = next_arg(&argc, &argv))){
        fprintf(stderr,"Expected filename for the index\n");
        exit(0);
      }
      /* refuse names that would overflow the fixed-size buffer */
      if(strlen(next_argument) >= sizeof(index_filename))
        panic("The index filename is too long (limit is %ld characters)",
              (long)(sizeof(index_filename) - 1));
      strcpy(index_filename, next_argument);
    }
    else if(0 == strcmp("-a", next_argument)){
      adding_to_existing_index = true;
    }
    else if(0 == strcmp("-r", next_argument)){
      traverse_directory = true;
    }
    else if(0 == strcmp("-register", next_argument)){
      register_database = true;
    }
    else if(0 == strcmp("-export", next_argument)){
      export_database = true;
    }
    else if(0 == strcmp("-v", next_argument)){
      /* one conversion per argument, so the build date is printed too */
      fprintf(stderr,"%s: %s, %s\n", command_name, VERSION, INDEXER_DATE);
    }
    else if(0 == strcmp("-stdin", next_argument)){
      read_files_from_stdin = true;
    }
    else if(0 == strcmp("-nopos", next_argument)){
      word_positions = false;
    }
    else if(0 == strcmp("-pos", next_argument)){
      word_positions = true;
    }
    else if(0 == strcmp("-nopairs", next_argument)){
      word_pairs = false;
    }
    else if(0 == strcmp("-pairs", next_argument)){
      word_pairs = true;
    }
    else if(0 == strcmp("-nocat", next_argument)){
      make_catalog = false;
    }
    else if(0 == strcmp("-mem", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number for the amount of memory to use");
      memory_to_use = atol(next_argument);
      if(memory_to_use < 1)
        panic("The -mem argument should not be less than 1");
      if(memory_to_use > 200)
        fprintf(stderr,"Warning: The -mem parameter was %ld Mbytes. That is a large number of mega bytes in current machines\n", memory_to_use);
    }
    else if(0 == strcmp("-cmmem", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number (1-100) for percentage of memory to use");
      cm_mem_percent = atol(next_argument);
      /* both ends of the range are fatal, so report them the same way */
      if(cm_mem_percent < 1 || cm_mem_percent > 100)
        panic("The -cmmem parameter was %ld%%. It should be between 1-100.",
              cm_mem_percent);
    }
    else if(0 == strcmp("-grow", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number (1-100) for database growing percentage");
      grow_percent = atol(next_argument);
      if(grow_percent < 1)
        panic("The -grow argument should not be less than 1");
    }
    else if(0 == strcmp("-textsize", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number for text size in megabytes");
      text_size = atol(next_argument);
      if(text_size < 1)
        panic("The -textsize argument should not be less than 1");
    }
    else if(0 == strcmp("-e", next_argument)){
      char *peek_argument = peek_arg(&argc, &argv);
      log_file_name = "/dev/null"; /* default to /dev/null */
      /* only consume the following word when it is not another switch */
      if((peek_argument != NULL) && ('-' != peek_argument[0])){
        log_file_name = next_arg(&argc, &argv);
      } /* end if (explicit log file) */
    } /* end if (-e) */
    else if(0 == strcmp("-l", next_argument)){
      /* a missing level used to hand NULL straight to atol */
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number for the log level");
      wais_log_level = atol(next_argument);
    } /* end if (-l) */
    else if(0 == strcmp("-cm", next_argument)){
      /* this is an undocumented argument to help use this to
         front end the CM application */
      indexingForBeta = true;
    }
    else if(0 == strcmp("-T", next_argument)){
      /* This is a specification for a "Special" type.  The next argument
         is the type name.  This will not index the body of the file. */
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a file type");
      typename = next_argument;
      type = next_argument;
      finish_header_function = filename_finish_header_function;
    }
    else if(0 == strcmp("-contents", next_argument)){
      index_contents = true;
    }
    else if(0 == strcmp("-nocontents", next_argument)){
      index_contents = false;
    }
    else if(0 == strcmp("-t", next_argument)){
      /* then we have a specialized file.  Each branch assigns only the
         hooks its format needs; hooks it does not mention keep whatever
         value an earlier switch gave them. */
      index_contents = true;
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a file type");
      if(0 == strcmp("groliers", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = groliers_separator_function;
        header_function = groliers_header_function;
        finish_header_function = groliers_finish_header_function;
      }
      else if(0 == strcmp("objc", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = wobjc_separator_function;
        header_function = wobjc_header_function;
        finish_header_function = wobjc_finish_header_function;
      }
      else if(0 == strcmp("mail", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = mail_separator_function;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("mail_or_rmail", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = mail_or_rmail_separator;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("mail_digest", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = mail_digest_separator_function;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("mh_bboard", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = mh_bboard_separator_function;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("rmail", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = rmail_separator_function;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("netnews", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = NULL;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("rn", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = rn_separator_function;
        header_function = mail_header_function;
        date_function = mail_date_function;
        finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("emacsinfo", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = emacs_info_separator_function;
        header_function = emacs_info_header_function;
        finish_header_function = emacs_info_finish_header_function;
      }
      else if(0 == strcmp("catalog", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = catalog_separator_function;
        header_function = catalog_header_function;
        finish_header_function = catalog_finish_header_function;
      }
      else if(0 == strcmp("bio", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = bio_separator_function;
        header_function = bio_header_function;
        finish_header_function = bio_finish_header_function;
      }
      else if(0 == strcmp("cmapp", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = cmapp_separator_function;
        header_function = cmapp_header_function;
        finish_header_function = cmapp_finish_header_function;
      }
      else if(0 == strcmp("ftp", next_argument)){
        type = "TEXT-FTP";
        typename = next_argument;
        separator_function = first_line_separator_function;
        header_function = first_line_header_function;
        finish_header_function = first_line_finish_header_function;
      }
      else if(0 == strcmp("jargon", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = jargon_separator_function;
        header_function = jargon_header_function;
        finish_header_function = jargon_finish_header_function;
      }
      else if(0 == strcmp("server", next_argument)){
        typename = next_argument;
        type = "WSRC";
        finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("text", next_argument)){
        type = "TEXT";
        typename = next_argument;
        check_for_text_file = true;
      }
      else if(0 == strcmp("filename", next_argument)){
        type = "TEXT";
        typename = next_argument;
        finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("irg", next_argument)){
        typename = next_argument;
        type = "TEXT";
        separator_function = irg_separator_function;
        header_function = irg_header_function;
        finish_header_function = irg_finish_header_function;
      }
      /* dash-separated items, Intro to Algorithms buglist, etc */
      else if(0 == strcmp("dash", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = dash_separator_function;
        header_function = dash_header_function;
        finish_header_function = dash_finish_header_function;
      }
      /* one_line-separated items */
      else if(0 == strcmp("one_line", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = one_line_separator_function;
        header_function = one_line_header_function;
        finish_header_function = one_line_finish_header_function;
      }
      /* blank line-separated items (paragraphs) */
      else if(0 == strcmp("para", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = para_separator_function;
        header_function = para_header_function;
        finish_header_function = para_finish_header_function;
      }
      /* seeker items */
      else if(0 == strcmp("seeker", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = seeker_separator_function;
        header_function = seeker_header_function;
        finish_header_function = seeker_finish_header_function;
      }
      /* medline format */
      else if(0 == strcmp("medline", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = medline_separator_function;
        header_function = medline_header_function;
        finish_header_function = medline_finish_header_function;
      }
      /* refer format */
      else if(0 == strcmp("refer", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = refer_separator_function;
        header_function = refer_header_function;
        finish_header_function = refer_finish_header_function;
      }
      /* first_line format */
      else if(0 == strcmp("first_line", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = first_line_separator_function;
        header_function = first_line_header_function;
        finish_header_function = first_line_finish_header_function;
      }
      /* rlin items */
      else if(0 == strcmp("rlin", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = rlin_separator_function;
        header_function = rlin_header_function;
        finish_header_function = rlin_finish_header_function;
      }
      else if(0 == strcmp("dvi", next_argument)){
        typename = next_argument;
        type = "DVI";
        finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("ps", next_argument)){
        typename = next_argument;
        type = "PS";
        finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("pict", next_argument)){
        typename = next_argument;
        type = "PICT";
        finish_header_function = filename_finish_header_function;
        index_contents = false;
      }
      else if(0 == strcmp("gif", next_argument)){
        typename = next_argument;
        type = "GIF";
        finish_header_function = filename_finish_header_function;
        index_contents = false;
      }
      else if(0 == strcmp("tiff", next_argument)){
        typename = next_argument;
        type = "TIFF";
        finish_header_function = filename_finish_header_function;
        index_contents = false;
      }
      /* BibTeX items */
      else if(0 == strcmp("bibtex", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = bibtex_separator_function;
        header_function = bibtex_header_function;
        finish_header_function = bibtex_finish_header_function;
      }
      /* ?:? separated hypertext items */
      else if(0 == strcmp("nhyp", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = nhyp_separator_function;
        header_function = nhyp_header_function;
        finish_header_function = nhyp_finish_header_function;
      }
      else if(0 == strcmp("ziff", next_argument)){
        type = "TEXT";
        typename = next_argument;
        separator_function = ziff_separator_function;
        header_function = ziff_header_function;
        finish_header_function = ziff_finish_header_function;
      }
      else{
        panic("Don't recognize the '%s' type", next_argument);
      }
    }
    else{
      panic("Don't recognize the '%s' option", next_argument);
    }

    next_argument = next_arg(&argc, &argv);
    /* running out of arguments is only acceptable in -stdin mode */
    if(!(read_files_from_stdin || next_argument)){
      fprintf(stderr,"No files specified\n");
      exit(0);
    }
  }

  /* index into argv_copy of the first filename argument */
  start_of_filenames = argc_copy - argc - 1;

  /* check index */
  if(0 == strlen(pathname_name(index_filename))){
    waislog(WLOG_HIGH, WLOG_ERROR,
            "The pathname specified for the destination of the index files ('%s') should have a leaf filename without an extention rather than just a directory.",
            index_filename);
    exit(0);
  }

  waislog(WLOG_MEDIUM, WLOG_INDEX,
          "Starting to build database %s", index_filename);

  if(0 != init_search_engine(index_filename, false, false,
                             cm_mem_percent, text_size, grow_percent))
    panic("unable to initialize search engine");

  if(true == adding_to_existing_index){
    db = openDatabase(index_filename, false, false);
    if(db == NULL){
      /* does not exist, create one */
      db = openDatabase(index_filename, true, false);
      if(db == NULL)
        panic("unable to open the database");
    }
  }
  else{
    db = openDatabase(index_filename, true, false);
    if(db == NULL)
      panic("unable to open the database");
  }

  { /* set up the memory hashtable */
    if(memory_to_use < 0){ /* default */
      /* do nothing */
    }
    else if(memory_to_use <= 2){
      hashtable_size = 1L<<16;
      flush_after_n_words = 50000;
    }
    else if(memory_to_use <= 5){
      hashtable_size = 1L<<16;
      flush_after_n_words = 150000;
    }
    else if(memory_to_use <= 10){
      /* shown to take about 6MB on a sun4, when it is dict limited */
      hashtable_size = 1L<<16;
      flush_after_n_words = 300000;
    }
    else if(memory_to_use <= 20){
      hashtable_size = 1L<<17;
      flush_after_n_words = 600000;
    }
    else{ /* over 20 Mbytes */
      hashtable_size = 1L<<18;
      flush_after_n_words = 1000000;
    }
    init_add_word(db, hashtable_size, flush_after_n_words);
  }

  /* Index one file (or directory) per iteration.  In -stdin mode every
     name, including the first, is read from standard input; otherwise
     the first name is already in next_argument and later ones come from
     the command line. */
  for(;;){
    if(read_files_from_stdin){
      next_argument = fgets(data_filename, MAXPATHLEN, stdin);
      if(NULL != next_argument){
        size_t len = strlen(next_argument);
        /* strip the trailing newline; len can be 0 if the line starts
           with a NUL byte, so guard the index */
        if(len > 0 && next_argument[len-1] == '\n'){
          next_argument[len-1] = '\0';
        }
      }
    }
    else if(!first_filename){
      next_argument = next_arg(&argc, &argv);
    }
    first_filename = false;

    if(NULL == next_argument)
      break;

    if(directoryp(next_argument)){
      /* directories are skipped unless -r was given */
      if(traverse_directory){
        index_directory(next_argument, separator_function,
                        header_function, date_function,
                        finish_header_function, type, db,
                        check_for_text_file, adding_to_existing_index,
                        word_positions, word_pairs);
      }
    }
    else{ /* not a directory */
      waislog(WLOG_MEDIUM, WLOG_INDEX,
              "Indexing file: %s", next_argument);
      index_text_file(next_argument, separator_function,
                      header_function, date_function,
                      finish_header_function, type, db,
                      check_for_text_file, adding_to_existing_index,
                      word_positions, word_pairs);
    }
  }

  finished_add_word(db);

  {
    char filename[MAX_FILENAME_LEN + 1];
    /* an existing source description file is left untouched */
    if(!probe_file(source_filename(filename, db))){
      char database_name[MAX_FILENAME_LEN];
      write_src_structure(source_filename(filename, db),
                          export_database ? pathname_name(index_filename)
                                          : truename(index_filename, database_name),
                          typename,
                          &argv_copy[start_of_filenames],
                          argc_copy - start_of_filenames,
                          export_database, 210L);
    }
    /* write out a description of the server if appropriate */
    if(register_database){
      register_src_structure(source_filename(filename, db));
    }
  }

  if(make_catalog)
    build_catalog(db);

  closeDatabase(db);
  waislog(WLOG_MEDIUM, WLOG_INDEX, "Finished build");
  exit(0);
}
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.