This is charset.c in view mode; [Download] [Up]
/* Conversion of files between different charsets and usages.
   Copyright (C) 1993, 1994 Free Software Foundation, Inc.
   Francois Pinard <pinard@iro.umontreal.ca>, 1993.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */

#include "recode.h"

/* Maximum number of charset values.  This is the capacity of
   charset_array; find_charset aborts when it would be exceeded.  */
#define MAX_CHARSETS 200

/* Hash table size for charset names.  */
#define HASH_TABLE_SIZE 997

/* Maximum number of characters per 10646 symbol.  The static buffers of
   code_to_symbol hold this many characters plus a terminating NUL.  */
#define MAX_SYMBOL_SIZE 9

/* Known pairs (for restricting listing).  pair_restriction is allocated by
   decode_known_pairs, and pair_restrictions counts the pairs stored in it.  */

static struct known_pair *pair_restriction = NULL;
static int pair_restrictions = 0;

/* Known character sets.  Each entry of hash_table is the head of a bucket;
   names colliding in a bucket are chained through allocated overflow
   entries.  A NULL name in a bucket head means the bucket is empty.  */

struct hash
  {
    const char *name;           /* cleaned up charset or alias name, or NULL */
    CHARSET *charset;           /* associated charset */
    struct hash *next;          /* next entry of the same bucket, or NULL */
  };
struct hash hash_table[HASH_TABLE_SIZE];
CHARSET charset_array[MAX_CHARSETS];
int number_of_charsets;         /* how many entries of charset_array are used */

/* Array of strings ready for argmatch, ended by a NULL sentinel.  */
static const char **argmatch_array;

/* Character names.  */

/* This module takes care only of short 10646 forms.  Module charname.c
   takes care of the full descriptive name for characters.  */

/*--------------------------------------------------------------------.
| Return a statically allocated 10646 symbol in a CHARSET for a given |
| CODE, or NULL if this symbol is not defined.  There are two static  |
| buffers used in alternance.
| `--------------------------------------------------------------------*/

static char *
code_to_symbol (CHARSET *charset, int code)
{
  /* Two result buffers, handed out in turn, so that a caller may still use
     the previous result after asking for another one.  */
  static char buffer[2][MAX_SYMBOL_SIZE + 1];
  static int which = 0;

  const char *source;           /* cursor in the charset table */
  char *target;                 /* cursor in the result buffer */
  int remaining;                /* characters of the field still to scan */

  /* Each row of the table describes 32 consecutive codes; when the row is
     missing, none of these codes is defined.  */
  source = (*charset->table)[code / 32];
  if (source == NULL)
    return NULL;

  /* Within a row, every code owns a field of charset->size characters.  A
     field beginning with a space denotes an undefined code.  */
  source += charset->size * (code % 32);
  if (*source == ' ')
    return NULL;

  which = !which;
  target = buffer[which];

  /* Copy the field, dropping any space found in it.  */
  for (remaining = charset->size; remaining > 0; remaining--)
    {
      if (*source != ' ')
        *target++ = *source;
      source++;
    }
  *target = '\0';

  return buffer[which];
}

/*------------------------------------------------------------------------.
| Print a 10646 symbol in a CHARSET for a given CODE, padding with spaces |
| after to the proper width.                                              |
`------------------------------------------------------------------------*/

/* The proper width is charset->size columns.  When CODE has no symbol, only
   the padding spaces are written.  Output goes to standard output.  */

static void
print_symbol (CHARSET *charset, int code)
{
  const char *symbol = code_to_symbol (charset, code);
  int printed = 0;              /* columns written so far */

  if (symbol != NULL)
    while (*symbol != '\0' && printed < charset->size)
      {
        putchar (*symbol);
        symbol++;
        printed++;
      }

  while (printed < charset->size)
    {
      putchar (' ');
      printed++;
    }
}

/*-----------------------------------------------------------------.
| Decode a known PAIRS argument, given in STRING, constructing the |
| pair_restriction array out of it.
| `-----------------------------------------------------------------*/ void decode_known_pairs (const char *string) { struct known_pair pair; const char *cursor; int value; pair_restriction = (struct known_pair *) xmalloc (16 * sizeof (struct known_pair)); value = -1; for (cursor = string; *cursor; cursor++) switch (*cursor) { default: usage (EXIT_FAILURE); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (value < 0) value = *cursor - '0'; else value = 10 * value + *cursor - '0'; break; case ':': if (value < 0 || value > 255) usage (EXIT_FAILURE); pair_restriction[pair_restrictions].left = (unsigned char) value; value = -1; break; case ',': if (value < 0 || value > 255) usage (EXIT_FAILURE); pair_restriction[pair_restrictions++].right = (unsigned char) value; if (pair_restrictions % 16 == 0) pair_restriction = (struct known_pair *) xrealloc (pair_restriction, (pair_restrictions + 16) * sizeof (struct known_pair)); value = -1; break; } if (value < 0 || value > 255) usage (EXIT_FAILURE); pair_restriction[pair_restrictions++].right = (unsigned char) value; } /*-------------------------------------------------------------. | Return non-zero if BEFORE to AFTER is currently restricted. | `-------------------------------------------------------------*/ static int check_restricted (CHARSET *before, CHARSET *after) { struct known_pair *pair; const char *left; const char *right; /* Reject the charset if not RFC1345. */ if (!before->table || !after->table) return 1; for (pair = pair_restriction; pair < pair_restriction + pair_restrictions; pair++) { /* Reject the charset if the characters in the pair do not exist of if their respective definition do not match. */ left = code_to_symbol (before, pair->left); if (!left) return 1; right = code_to_symbol (after, pair->right); if (!right) return 1; if (strcmp (left, right)) return 1; } /* No restriction found. */ return 0; } /* Charset names. 
*/ /*--------------------------------------. | Prepare charsets for initialization. | `--------------------------------------*/ void prepare_charset_initialization (void) { int counter; for (counter = 0; counter < HASH_TABLE_SIZE; counter++) { hash_table[counter].name = NULL; hash_table[counter].next = NULL; } number_of_charsets = 0; } /*-----------------------------------------------------------------------. | Return a newly allocated copy of charset NAME, with upper case letters | | turned into lower case, and all non alphanumeric discarded. | `-----------------------------------------------------------------------*/ static char * cleanup_charset_name (const char *name) { char *result; char *out; const char *in; int character; result = xmalloc (strlen (name) + 1); out = result; for (in = name; *in; in++) { character = *(const unsigned char *) in; if ((character >= 'a' && character <= 'z') || (character >= '0' && character <= '9')) *out++ = character; else if (character >= 'A' && character <= 'Z') *out++ = character - 'A' + 'a'; } *out = '\0'; return result; } /*-----------------------------------. | Return a hash index for a STRING. | `-----------------------------------*/ #ifdef DIFF_HASH /* Given a hash value and a new character, return a new hash value. */ #define UINT_BIT (sizeof (unsigned) * CHAR_BIT) #define ROTATE_LEFT(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n))) #define HASH(h, c) ((c) + ROTATE_LEFT (h, 7)) static int hash_string (const char *string) { unsigned value; value = 0; for (; *string; string++) value = HASH (value, *(const unsigned char *) string); return value % HASH_TABLE_SIZE; } #else /* not DIFF_HASH */ static int hash_string (const char *string) { unsigned value; value = 0; for (; *string; string++) value = ((value * 31 + (int) *(const unsigned char *) string) % HASH_TABLE_SIZE); return value; } #endif /* not DIFF_HASH */ /*--------------------------------------------------------------------------. 
| Return the charset from its NAME or alias name. If it does not already | | exist, add a new charset entry and initialize it with a brand new value. | `--------------------------------------------------------------------------*/ CHARSET * find_charset (const char *name) { char *hashname; struct hash *hash; CHARSET *charset; /* Search the whole hash bucket and return any match. */ hashname = cleanup_charset_name (name); for (hash = hash_table + hash_string (hashname); hash->name; hash = hash->next) { if (strcmp (hashname, hash->name) == 0) { free (hashname); return hash->charset; } if (!hash->next) break; } /* A new charset has to be created. */ if (number_of_charsets == MAX_CHARSETS) error (EXIT_FAILURE, 0, "MAX_CHARSETS is too small"); charset = charset_array + number_of_charsets++; /* If the current slot is already used, create an overflow entry and initialize it enough so it could be taken for the current slot. */ if (hash->name) { hash->next = (struct hash *) xmalloc (sizeof (struct hash)); hash = hash->next; hash->next = NULL; } /* Initialize the current slot with the new charset. */ hash->name = hashname; hash->charset = charset; charset->name = name; charset->ignore = 0; charset->table = NULL; return charset; } /*-------------------------------------------------------------------------. | Have NAME as an alternate charset name for OLD_NAME. Create OLD_NAME if | | it does not exist already. | `-------------------------------------------------------------------------*/ void declare_alias (const char *name, const char *old_name) { char *hashname; struct hash *hash; CHARSET *old_charset; /* Find the old value. */ old_charset = find_charset (old_name); /* Search the whole hash bucket. 
*/ hashname = cleanup_charset_name (name); for (hash = hash_table + hash_string (hashname); hash->name; hash = hash->next) { if (strcmp (hashname, hash->name) == 0) { if (hash->charset != old_charset) error (EXIT_FAILURE, 0, "Charset %s already exists and is not %s", name, old_name); free (hashname); return; } if (!hash->next) break; } /* If the current slot is already used, create an overflow entry and initialize it enough so it could be taken for the current slot. */ if (hash->name) { hash->next = (struct hash *) xmalloc (sizeof (struct hash)); hash = hash->next; hash->next = NULL; } /* Initialize the current slot with the old charset. */ hash->name = hashname; hash->charset = old_charset; } /*------------------------------------------. | Construct the string array for argmatch. | `------------------------------------------*/ void make_argmatch_array (void) { struct hash *hash; /* cursor in charsets */ int number; /* number of strings */ int counter; /* all purpose counter */ #ifdef HASH_STATS int buckets; /* number of non-empty buckets */ #endif /* Count how many strings we need. */ number = 0; for (counter = 0; counter < HASH_TABLE_SIZE; counter++) for (hash = hash_table + counter; hash && hash->name; hash = hash->next) number++; #ifdef HASH_STATS buckets = 0; for (counter = 0; counter < HASH_TABLE_SIZE; counter++) if (hash_table[counter].name) buckets++; fprintf (stderr, "Hash stats: %d names using %d buckets out of %d\n", number, buckets, HASH_TABLE_SIZE); #endif /* Allocate the argmatch array, with place for a NULL sentinel. */ argmatch_array = (const char **) xmalloc ((number + 1) * sizeof (const char *)); /* Fill in the array. */ number = 0; for (counter = 0; counter < HASH_TABLE_SIZE; counter++) for (hash = hash_table + counter; hash && hash->name; hash = hash->next) argmatch_array[number++] = hash->name; argmatch_array[number] = NULL; } /*-----------------------------------------------------------------------. 
| Return the NAME of a charset, un-abbreviated and cleaned up. Diagnose | | and abort if this cannot be done successfully. A NULL or empty string | | means the default charset, if this default charset is defined. | `-----------------------------------------------------------------------*/ const char * clean_charset_name (const char *name) { char *hashname; int ordinal; /* Look for a match. */ if (!name) name = ""; #ifdef DEFAULT_CHARSET if (!*name) name = DEFAULT_CHARSET; #endif hashname = cleanup_charset_name (name); ordinal = argmatch (hashname, argmatch_array); free (hashname); /* Diagnose any match error, notifying usage that we are decoding charsets. */ switch (ordinal) { case -2: error (0, 0, "Ambiguous charset `%s'", name); decoding_charset_flag = 1; usage (EXIT_FAILURE); case -1: error (0, 0, "Unknown charset `%s'", name); decoding_charset_flag = 1; usage (EXIT_FAILURE); } return argmatch_array[ordinal]; } /*----------------------------------------------------------------------. | Order two struct hash's, using the true charset name as the first key | | and the current name as the second key. | `----------------------------------------------------------------------*/ static int compare_struct_hash (const void *void_first, const void *void_second) { int value; value = strcmp (((const struct hash *) void_first)->charset->name, ((const struct hash *) void_second)->charset->name); if (value != 0) return value; value = strcmp (((const struct hash *) void_first)->name, ((const struct hash *) void_second)->name); return value; } /*-----------------------------------------------------------------------. | List all available charsets, obeying restrictions for an AFTER charset | | if any. | `-----------------------------------------------------------------------*/ void list_all_charsets (CHARSET *after) { struct hash *array; struct hash *hash; int number; int counter; int list_flag; /* Count how many charsets we have. 
*/ number = 0; for (counter = 0; counter < HASH_TABLE_SIZE; counter++) for (hash = hash_table + counter; hash && hash->name; hash = hash->next) number++; /* Allocate a structure to hold them. */ array = (struct hash *) xmalloc (number * sizeof (struct hash)); /* Copy all charsets in it. */ number = 0; for (counter = 0; counter < HASH_TABLE_SIZE; counter++) for (hash = hash_table + counter; hash && hash->name; hash = hash->next) array[number++] = *hash; /* Sort it. */ qsort (array, number, sizeof (struct hash), compare_struct_hash); /* Print it, one line per charset, giving the true charset name first, followed by all its alias in lexicographic order. */ for (hash = array; hash < array + number; hash++) { /* Begin a new line with the true charset name when it changes. */ if (hash == array || hash->charset->name != (hash - 1)->charset->name) { if (list_flag && hash != array) printf ("\n"); list_flag = !after || !check_restricted (hash->charset, after); if (list_flag) printf ("%s", hash->charset->name); } /* Print the charset name or alias in its cleaned up form. */ if (list_flag) printf (" %s", hash->name); } if (list_flag) printf ("\n"); /* Release the work array. */ free (array); } /* Charset contents. */ /*-----------------------------------------------------------------------. | For a given STEP recoding into a RFC 1345 charset, produce an explicit | | recoding table. | `-----------------------------------------------------------------------*/ void init_table_for_rfc1345 (STEP *step) { const char *symbol; char *pool; const char **table; int length; int counter; /* First compute how much memory is needed. */ length = 0; for (counter = 0; counter < 256; counter++) { symbol = code_to_symbol (step->before, counter); if (symbol) length += strlen (symbol) + 1; } /* Do the actual allocation and filling. 
*/ table = (const char **) xmalloc (256 * sizeof (char *) + length); pool = (char *) (table + 256); for (counter = 0; counter < 256; counter++) { symbol = code_to_symbol (step->before, counter); if (symbol) { if (strcmp (symbol, "SP") == 0) symbol = " "; else if (strcmp (symbol, "LF") == 0) symbol = "\n"; table[counter] = pool; while (*pool++ = *symbol++) ; } else table[counter] = NULL; } step->one_to_many = table; } /*------------------------------------------------------------------. | Print a concise, tabular CHARSET description on standard output. | `------------------------------------------------------------------*/ void list_concise_charset (CHARSET *charset) { DOUBLE_TABLE *table; /* double table */ int half; /* half 0, half 1 of the table */ const char *format; /* format string */ int counter; /* code counter */ int counter2; /* code counter */ int code; /* code value */ /* Ensure we have a double table to play with. */ if (charset->table) table = charset->table; else error (EXIT_FAILURE, 0, "Cannot list `%s', no 10646 names available for this charset", charset->name); printf ("%s\n", charset->name); /* Select format for numbers. */ switch (list_format) { default: return; /* cannot happen */ case NO_FORMAT: case DECIMAL_FORMAT: format = "%3d"; break; case OCTAL_FORMAT: format = "%0.3o"; break; case HEXADECIMAL_FORMAT: format = "%0.2x"; break; } /* Print both halves of the table. */ for (half = 0; half < 2; half++) { /* Skip printing this half if it is empty. */ for (counter = 4 * half; counter < 4 * half + 4; counter++) if ((*table)[counter]) break; if (counter == 4 * half + 4) continue; /* Print this half. 
*/ printf ("\n"); for (counter = 128 * half; counter < 128 * half + 16; counter++) for (counter2 = 0; counter2 < 128; counter2 += 16) { if (counter2 > 0) printf (" "); code = counter + counter2; printf (format, code); printf (" "); print_symbol (charset, code); if (counter2 == 112) printf ("\n"); } } } /*------------------------------------------------------. | Print a full CHARSET description on standard output. | `------------------------------------------------------*/ void list_full_charset (CHARSET *charset) { int insert_white; /* insert a while line before printing */ int code; /* code counter */ const char *symbol; /* symbol for code */ const char *charname; /* charname for code */ /* Ensure we have a double table to play with. */ if (!charset->table) error (EXIT_FAILURE, 0, "Sorry, no 10646 names available for `%s'", charset->name); /* Print the long table. */ printf ("dec oct hex ch %s\n", charset->name); insert_white = 1; for (code = 0; code < 256; code++) if ((symbol = code_to_symbol (charset, code)), symbol) { if (insert_white) { printf ("\n"); insert_white = 0; } printf ("%3d %0.3o %0.2x ", code, code, code); print_symbol (charset, code); if ((charname = symbol_to_charname (symbol)), charname) printf (" %s", charname); printf ("\n"); } else insert_white = 1; }
These are the contents of the former NiCE NeXT User Group NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.