/* charset.c

   This is charset.c in view mode; [Download] [Up]  */
/* Conversion of files between different charsets and usages.
   Copyright (C) 1993, 1994 Free Software Foundation, Inc.
   Francois Pinard <pinard@iro.umontreal.ca>, 1993.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include "recode.h"

/* Maximum number of charset values.  This is the capacity of
   charset_array; find_charset reports a fatal error once it is full.  */
#define MAX_CHARSETS 200

/* Hash table size for charset names, that is, the number of buckets in
   hash_table.  hash_string returns an index below this value.  */
#define HASH_TABLE_SIZE 997

/* Maximum number of characters per 10646 symbol.  The static buffers of
   code_to_symbol are sized from this value, plus one for the NUL.  */
#define MAX_SYMBOL_SIZE 9

/* Known pairs (for restricting listing).  The array is allocated and
   filled by decode_known_pairs, and check_restricted tests a couple of
   charsets against each of the pair_restrictions entries it holds.  */

static struct known_pair *pair_restriction = NULL;
static int pair_restrictions = 0;

/* Known character sets.  */

/* One entry of the name table.  The elements of hash_table are the first
   entries of the buckets; further entries of a bucket are allocated one
   by one and chained through `next'.  Names and aliases both get an
   entry, so several entries may refer to the same charset.  */

struct hash
  {
    const char *name;		/* cleaned up charset or alias name, or NULL */
    CHARSET *charset;		/* associated charset */
    struct hash *next;		/* next entry of the same bucket, or NULL */
  };
/* The buckets, indexed by hash_string; the storage for all charsets; and
   the number of elements of that storage which are in use.  */
struct hash hash_table[HASH_TABLE_SIZE];
CHARSET charset_array[MAX_CHARSETS];
int number_of_charsets;

/* Array of strings ready for argmatch: every name held in hash_table,
   followed by a NULL sentinel.  It is built by make_argmatch_array and
   searched by clean_charset_name.  */
static const char **argmatch_array;

/* Character names.  */

/* This module takes care only of short 10646 forms.  Module charname.c
   takes care of the full descriptive name for characters.  */

/*----------------------------------------------------------------------.
| Return the 10646 symbol that CHARSET associates with CODE, or NULL if |
| there is none.  The symbol is stripped of its padding spaces and held |
| in one of two static buffers which are used alternately, so that the  |
| results of two consecutive calls remain usable at the same time.      |
`----------------------------------------------------------------------*/

static char *
code_to_symbol (CHARSET *charset, int code)
{
  static char buffer[2][MAX_SYMBOL_SIZE + 1];
  static int which = 0;
  const char *source;
  char *result;
  char *target;
  int remaining;

  /* Each row of the table describes 32 consecutive codes; a missing row
     means that none of them has a symbol.  */

  source = (*charset->table)[code / 32];
  if (source == NULL)
    return NULL;

  /* Within a row, each code owns a field of charset->size characters.  A
     field starting with a space stands for an undefined symbol.  */

  source += charset->size * (code % 32);
  if (*source == ' ')
    return NULL;

  /* Switch to the other buffer, then copy the field without its spaces.  */

  which = !which;
  result = buffer[which];
  target = result;
  for (remaining = charset->size; remaining > 0; remaining--)
    {
      if (*source != ' ')
        *target++ = *source;
      source++;
    }
  *target = '\0';

  return result;
}

/*----------------------------------------------------------------------.
| Print on standard output the 10646 symbol that CHARSET associates     |
| with CODE, then enough spaces to fill a field of charset->size        |
| characters.  A code without symbol yields a field of spaces only.     |
`----------------------------------------------------------------------*/

static void
print_symbol (CHARSET *charset, int code)
{
  const char *symbol;
  int column;

  column = 0;
  symbol = code_to_symbol (charset, code);

  /* Output the symbol itself, never going beyond the field width.  */

  if (symbol != NULL)
    while (*symbol != '\0' && column < charset->size)
      {
        putchar (*symbol);
        symbol++;
        column++;
      }

  /* Fill what remains of the field with spaces.  */

  while (column < charset->size)
    {
      putchar (' ');
      column++;
    }
}

/*----------------------------------------------------------------------.
| Decode the known PAIRS argument given in STRING, a comma separated    |
| list of LEFT:RIGHT pairs of decimal codes between 0 and 255, and      |
| rebuild the pair_restriction array out of it.  Any malformed          |
| argument is reported through usage (EXIT_FAILURE).                    |
`----------------------------------------------------------------------*/

void
decode_known_pairs (const char *string)
{
  const char *cursor;		/* cursor in STRING */
  int value;			/* number being read, or -1 if none yet */
  int left_seen;		/* non-zero once the pair got its LEFT */

  /* Start from an empty list, so that a repeated call replaces the
     previous pairs instead of indexing the fresh array with a stale
     count.  */

  if (pair_restriction)
    free (pair_restriction);
  pair_restrictions = 0;
  pair_restriction = (struct known_pair *)
    xmalloc (16 * sizeof (struct known_pair));

  value = -1;
  left_seen = 0;
  for (cursor = string; *cursor; cursor++)
    switch (*cursor)
      {
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
        if (value < 0)
          value = *cursor - '0';
        else
          value = 10 * value + *cursor - '0';

        /* Saturate just above the valid range.  The value is still
           rejected at the next separator, but a long run of digits can
           no longer overflow the integer.  */

        if (value > 255)
          value = 256;
        break;

      case ':':

        /* A pair has exactly one LEFT code.  */

        if (value < 0 || value > 255 || left_seen)
          usage (EXIT_FAILURE);
        pair_restriction[pair_restrictions].left = (unsigned char) value;
        left_seen = 1;
        value = -1;
        break;

      case ',':

        /* Refuse a RIGHT code having no LEFT one, as the left member of
           the pair would otherwise stay uninitialized.  */

        if (value < 0 || value > 255 || !left_seen)
          usage (EXIT_FAILURE);
        pair_restriction[pair_restrictions++].right = (unsigned char) value;

        /* Grow the array by 16 pairs each time it gets full.  */

        if (pair_restrictions % 16 == 0)
          pair_restriction = (struct known_pair *)
            xrealloc (pair_restriction,
                      (pair_restrictions + 16) * sizeof (struct known_pair));
        left_seen = 0;
        value = -1;
        break;

      default:
        usage (EXIT_FAILURE);
        break;
      }

  /* The last pair is not followed by a comma, complete it here.  */

  if (value < 0 || value > 255 || !left_seen)
    usage (EXIT_FAILURE);
  pair_restriction[pair_restrictions++].right = (unsigned char) value;
}

/*----------------------------------------------------------------------.
| Return non-zero if BEFORE to AFTER is rejected by the known pairs,    |
| that is, if either charset has no symbol table, or if for some pair   |
| the left code in BEFORE and the right code in AFTER do not both have  |
| a symbol, the very same one.                                          |
`----------------------------------------------------------------------*/

static int
check_restricted (CHARSET *before, CHARSET *after)
{
  const char *before_symbol;
  const char *after_symbol;
  int index;

  /* Symbols cannot be compared for a charset having no symbol table.  */

  if (before->table == NULL || after->table == NULL)
    return 1;

  /* Each known pair has to designate one and the same symbol on both
     sides.  The two symbols are fetched by consecutive calls, so they sit
     in distinct static buffers while being compared.  */

  for (index = 0; index < pair_restrictions; index++)
    {
      before_symbol = code_to_symbol (before, pair_restriction[index].left);
      if (before_symbol == NULL)
        return 1;

      after_symbol = code_to_symbol (after, pair_restriction[index].right);
      if (after_symbol == NULL)
        return 1;

      if (strcmp (before_symbol, after_symbol) != 0)
        return 1;
    }

  /* All pairs agree, or there was none to check.  */

  return 0;
}

/* Charset names.  */

/*--------------------------------------.
| Prepare charsets for initialization.  |
`--------------------------------------*/

void
prepare_charset_initialization (void)
{
  int counter;

  for (counter = 0; counter < HASH_TABLE_SIZE; counter++)
    {
      hash_table[counter].name = NULL;
      hash_table[counter].next = NULL;
    }
  number_of_charsets = 0;
}

/*----------------------------------------------------------------------.
| Return a newly allocated copy of charset NAME in which upper case     |
| letters A-Z become lower case, and every character other than a       |
| letter A-Z, a-z or a digit 0-9 is dropped.  The caller owns it.       |
`----------------------------------------------------------------------*/

static char *
cleanup_charset_name (const char *name)
{
  const unsigned char *source;
  char *copy;
  char *target;

  /* The cleaned up name is never longer than NAME itself.  */

  copy = xmalloc (strlen (name) + 1);
  target = copy;

  for (source = (const unsigned char *) name; *source != '\0'; source++)
    {
      if (*source >= 'A' && *source <= 'Z')
        *target++ = *source - 'A' + 'a';
      else if ((*source >= 'a' && *source <= 'z')
               || (*source >= '0' && *source <= '9'))
        *target++ = *source;
    }
  *target = '\0';

  return copy;
}

/*----------------------------------------------------------------------.
| Return the index of the hash_table bucket for STRING, a value from 0  |
| to HASH_TABLE_SIZE - 1.  Defining DIFF_HASH selects another formula.  |
`----------------------------------------------------------------------*/

#ifdef DIFF_HASH

/* Given a hash value and a new character, return a new hash value.  */

#define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
#define ROTATE_LEFT(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
#define HASH(h, c) ((c) + ROTATE_LEFT (h, 7))

static int
hash_string (const char *string)
{
  const unsigned char *cursor;
  unsigned accumulator = 0;

  /* Rotate the whole accumulator for each character, and reduce it to a
     bucket index only once, at the end.  */

  for (cursor = (const unsigned char *) string; *cursor; cursor++)
    accumulator = HASH (accumulator, *cursor);

  return accumulator % HASH_TABLE_SIZE;
}

#else /* not DIFF_HASH */

static int
hash_string (const char *string)
{
  const unsigned char *cursor = (const unsigned char *) string;
  unsigned accumulator = 0;

  /* Multiply by 31 and add each character in turn, reducing the result
     at each step so it always stays a valid bucket index.  */

  while (*cursor != '\0')
    {
      accumulator = accumulator * 31 + *cursor;
      accumulator %= HASH_TABLE_SIZE;
      cursor++;
    }

  return accumulator;
}

#endif /* not DIFF_HASH */

/*--------------------------------------------------------------------------.
| Return the charset from its NAME or alias name.  If it does not already   |
| exist, add a new charset entry and initialize it with a brand new value.  |
`--------------------------------------------------------------------------*/

CHARSET *
find_charset (const char *name)
{
  char *hashname;
  struct hash *hash;
  CHARSET *charset;

  /* Search the whole hash bucket and return any match.  */

  hashname = cleanup_charset_name (name);
  for (hash = hash_table + hash_string (hashname);
       hash->name;
       hash = hash->next)
    {
      if (strcmp (hashname, hash->name) == 0)
	{
	  free (hashname);
	  return hash->charset;
	}
      if (!hash->next)
	break;
    }

  /* A new charset has to be created.  */

  if (number_of_charsets == MAX_CHARSETS)
    error (EXIT_FAILURE, 0, "MAX_CHARSETS is too small");

  charset = charset_array + number_of_charsets++;

  /* If the current slot is already used, create an overflow entry and
     initialize it enough so it could be taken for the current slot.  */

  if (hash->name)
    {
      hash->next = (struct hash *) xmalloc (sizeof (struct hash));
      hash = hash->next;
      hash->next = NULL;
    }

  /* Initialize the current slot with the new charset.  */

  hash->name = hashname;
  hash->charset = charset;

  charset->name = name;
  charset->ignore = 0;
  charset->table = NULL;

  return charset;
}

/*-------------------------------------------------------------------------.
| Have NAME as an alternate charset name for OLD_NAME.  Create OLD_NAME if |
| it does not exist already.						   |
`-------------------------------------------------------------------------*/

void
declare_alias (const char *name, const char *old_name)
{
  char *hashname;
  struct hash *hash;
  CHARSET *old_charset;

  /* Find the old value.  */

  old_charset = find_charset (old_name);

  /* Search the whole hash bucket.  */

  hashname = cleanup_charset_name (name);
  for (hash = hash_table + hash_string (hashname);
       hash->name;
       hash = hash->next)
    {
      if (strcmp (hashname, hash->name) == 0)
	{
	  if (hash->charset != old_charset)
	    error (EXIT_FAILURE, 0, "Charset %s already exists and is not %s",
		   name, old_name);
	  free (hashname);
	  return;
	}
      if (!hash->next)
	break;
    }

  /* If the current slot is already used, create an overflow entry and
     initialize it enough so it could be taken for the current slot.  */

  if (hash->name)
    {
      hash->next = (struct hash *) xmalloc (sizeof (struct hash));
      hash = hash->next;
      hash->next = NULL;
    }

  /* Initialize the current slot with the old charset.  */

  hash->name = hashname;
  hash->charset = old_charset;
}

/*----------------------------------------------------------------------.
| Construct the string array for argmatch: all the names found in the   |
| hash table, in bucket order, followed by a NULL sentinel.             |
`----------------------------------------------------------------------*/

void
make_argmatch_array (void)
{
  struct hash *entry;		/* cursor in entries */
  const char **cursor;		/* cursor in the array being filled */
  int bucket;			/* bucket counter */
  int total;			/* number of names */
#ifdef HASH_STATS
  int used;			/* number of non-empty buckets */
#endif

  /* A first pass over all buckets tells how many names there are.  */

  total = 0;
  for (bucket = 0; bucket < HASH_TABLE_SIZE; bucket++)
    {
      entry = &hash_table[bucket];
      while (entry && entry->name)
        {
          total++;
          entry = entry->next;
        }
    }

#ifdef HASH_STATS
  used = 0;
  for (bucket = 0; bucket < HASH_TABLE_SIZE; bucket++)
    if (hash_table[bucket].name != NULL)
      used++;

  fprintf (stderr, "Hash stats: %d names using %d buckets out of %d\n",
           total, used, HASH_TABLE_SIZE);
#endif

  /* Get room for all names, plus one for the NULL sentinel.  */

  argmatch_array
    = (const char **) xmalloc ((total + 1) * sizeof (const char *));

  /* A second pass stores the names.  They are shared with the hash
     table, not copied.  */

  cursor = argmatch_array;
  for (bucket = 0; bucket < HASH_TABLE_SIZE; bucket++)
    {
      entry = &hash_table[bucket];
      while (entry && entry->name)
        {
          *cursor++ = entry->name;
          entry = entry->next;
        }
    }
  *cursor = NULL;
}

/*----------------------------------------------------------------------.
| Return the cleaned up, un-abbreviated form of the charset NAME, as    |
| selected by argmatch among all known names.  Diagnose an ambiguous    |
| or unknown NAME and call usage.  A NULL or empty NAME stands for      |
| DEFAULT_CHARSET, when this macro is defined.                          |
`----------------------------------------------------------------------*/

const char *
clean_charset_name (const char *name)
{
  char *key;
  int match;

  /* Replace a missing name by the default one, if any.  */

  if (name == NULL)
    name = "";
#ifdef DEFAULT_CHARSET
  if (*name == '\0')
    name = DEFAULT_CHARSET;
#endif

  /* Let argmatch look up the cleaned up name.  */

  key = cleanup_charset_name (name);
  match = argmatch (key, argmatch_array);
  free (key);

  if (match >= 0)
    return argmatch_array[match];

  /* The lookup failed.  Diagnose the error, after having told usage that
     charsets were being decoded.  NOTE(review): usage is presumably not
     returning when given EXIT_FAILURE; the cases below rely on it.  */

  switch (match)
    {
    case -2:
      error (0, 0, "Ambiguous charset `%s'", name);
      decoding_charset_flag = 1;
      usage (EXIT_FAILURE);

    case -1:
      error (0, 0, "Unknown charset `%s'", name);
      decoding_charset_flag = 1;
      usage (EXIT_FAILURE);
    }

  return argmatch_array[match];
}

/*----------------------------------------------------------------------.
| Order two struct hash's for qsort, using the true charset name as     |
| the first key and the entry's own name as the second key.             |
`----------------------------------------------------------------------*/

static int
compare_struct_hash (const void *void_first, const void *void_second)
{
  const struct hash *first = (const struct hash *) void_first;
  const struct hash *second = (const struct hash *) void_second;
  int order;

  /* The entry names only matter within a same true charset name.  */

  order = strcmp (first->charset->name, second->charset->name);
  if (order == 0)
    order = strcmp (first->name, second->name);

  return order;
}

/*----------------------------------------------------------------------.
| List all available charsets on standard output, one line for each,    |
| as the true charset name followed by all its cleaned up names.  If    |
| AFTER is not NULL, skip every charset which check_restricted rejects  |
| as the BEFORE side of AFTER.                                          |
`----------------------------------------------------------------------*/

void
list_all_charsets (CHARSET *after)
{
  struct hash *array;		/* sortable copy of all entries */
  struct hash *hash;		/* cursor in entries */
  int number;			/* number of entries */
  int counter;			/* bucket counter */
  int list_flag;		/* non-zero while the charset is listed */

  /* Count how many names we have.  */

  number = 0;
  for (counter = 0; counter < HASH_TABLE_SIZE; counter++)
    for (hash = hash_table + counter;
         hash && hash->name;
         hash = hash->next)
      number++;

  /* With no name at all, there is nothing to print, and no reason to ask
     for an empty block of memory.  */

  if (number == 0)
    return;

  /* Allocate a structure to hold them.  */

  array = (struct hash *) xmalloc (number * sizeof (struct hash));

  /* Copy all entries in it.  */

  number = 0;
  for (counter = 0; counter < HASH_TABLE_SIZE; counter++)
    for (hash = hash_table + counter;
         hash && hash->name;
         hash = hash->next)
      array[number++] = *hash;

  /* Sort it.  */

  qsort (array, number, sizeof (struct hash), compare_struct_hash);

  /* Print it, one line per charset, giving the true charset name first,
     followed by all its aliases in lexicographic order.  */

  /* No line has been started yet.  The flag is tested for the very first
     entry before being computed, so it needs a value here.  */

  list_flag = 0;

  for (hash = array; hash < array + number; hash++)
    {

      /* Begin a new line with the true charset name when it changes.
         Entries of a same charset share the same name pointer, and were
         made adjacent by the sort.  */

      if (hash == array || hash->charset->name != (hash - 1)->charset->name)
        {
          if (list_flag && hash != array)
            printf ("\n");

          list_flag = !after || !check_restricted (hash->charset, after);

          if (list_flag)
            printf ("%s", hash->charset->name);
        }

      /* Print the charset name or alias in its cleaned up form.  */

      if (list_flag)
        printf (" %s", hash->name);
    }

  /* Terminate the last line, if it was printed.  */

  if (list_flag)
    printf ("\n");

  /* Release the work array.  */

  free (array);
}

/* Charset contents.  */

/*----------------------------------------------------------------------.
| Build the explicit one to many recoding table of STEP: for each of    |
| the 256 codes, the 10646 symbol it has in the charset step->before,   |
| or NULL if none.  Symbols SP and LF give a space and a newline.       |
`----------------------------------------------------------------------*/

void
init_table_for_rfc1345 (STEP *step)
{
  const char **strings;		/* the table being built */
  const char *text;		/* string for the current code */
  char *storage;		/* where the next string gets copied */
  int needed;			/* size of all strings */
  int code;			/* code counter */

  /* Add up the room taken by all symbols, each with its NUL.  */

  needed = 0;
  for (code = 0; code < 256; code++)
    {
      text = code_to_symbol (step->before, code);
      if (text != NULL)
        needed += strlen (text) + 1;
    }

  /* A single block holds the 256 pointers, then the strings.  */

  strings = (const char **) xmalloc (256 * sizeof (char *) + needed);
  storage = (char *) (strings + 256);

  for (code = 0; code < 256; code++)
    {
      text = code_to_symbol (step->before, code);
      if (text == NULL)
        {
          strings[code] = NULL;
          continue;
        }

      /* These two symbols are replaced by the character itself, which is
         not longer than the symbol accounted for above.  */

      if (strcmp (text, "SP") == 0)
        text = " ";
      else if (strcmp (text, "LF") == 0)
        text = "\n";

      /* Copy the string, including its NUL, in the block.  */

      strings[code] = storage;
      do
        *storage++ = *text;
      while (*text++ != '\0');
    }

  step->one_to_many = strings;
}

/*----------------------------------------------------------------------.
| Print a concise, tabular CHARSET description on standard output:      |
| for each non-empty half of 128 codes, 16 lines of 8 columns giving    |
| the code, numbered according to list_format, and its symbol.          |
`----------------------------------------------------------------------*/

void
list_concise_charset (CHARSET *charset)
{
  DOUBLE_TABLE *table;		/* double table */
  const char *format;		/* format string for numbers */
  int half;			/* half 0, half 1 of the table */
  int row;			/* table row, or printed line */
  int column;			/* printed column */
  int code;			/* code value */
  int used;			/* non-zero if the half has symbols */

  /* Nothing can be listed without a double table.  */

  if (!charset->table)
    error (EXIT_FAILURE, 0,
           "Cannot list `%s', no 10646 names available for this charset",
           charset->name);
  table = charset->table;

  printf ("%s\n", charset->name);

  /* Select format for numbers.  Any other format cannot happen.  */

  if (list_format == NO_FORMAT || list_format == DECIMAL_FORMAT)
    format = "%3d";
  else if (list_format == OCTAL_FORMAT)
    format = "%0.3o";
  else if (list_format == HEXADECIMAL_FORMAT)
    format = "%0.2x";
  else
    return;

  /* Print both halves of the table.  */

  for (half = 0; half < 2; half++)
    {

      /* A half spans four rows of the double table, and is skipped when
         all four are missing.  */

      used = 0;
      for (row = 4 * half; row < 4 * half + 4; row++)
        if ((*table)[row])
          {
            used = 1;
            break;
          }
      if (!used)
        continue;

      /* Print this half.  Codes increase by one from a line to the next,
         and by 16 from a column to the next.  */

      printf ("\n");
      for (row = 0; row < 16; row++)
        for (column = 0; column < 8; column++)
          {
            if (column > 0)
              printf ("  ");

            code = 128 * half + row + 16 * column;
            printf (format, code);
            printf (" ");
            print_symbol (charset, code);

            if (column == 7)
              printf ("\n");
          }
    }
}

/*----------------------------------------------------------------------.
| Print a full CHARSET description on standard output: one line per     |
| defined code, with the code in decimal, octal and hexadecimal, its    |
| symbol and, when known, its full character name.                      |
`----------------------------------------------------------------------*/

void
list_full_charset (CHARSET *charset)
{
  const char *symbol;		/* symbol for code */
  const char *charname;		/* charname for code */
  int code;			/* code counter */
  int gap_pending;		/* insert a white line before printing */

  /* Nothing can be listed without a double table.  */

  if (charset->table == NULL)
    error (EXIT_FAILURE, 0,
           "Sorry, no 10646 names available for `%s'", charset->name);

  /* Print the long table.  A white line follows the title, and separates
     runs of defined codes from each other.  */

  printf ("dec  oct hex    ch   %s\n", charset->name);
  gap_pending = 1;

  for (code = 0; code < 256; code++)
    {
      symbol = code_to_symbol (charset, code);
      if (symbol == NULL)
        {
          gap_pending = 1;
          continue;
        }

      if (gap_pending)
        {
          printf ("\n");
          gap_pending = 0;
        }

      /* Printing the symbol fetches it once more, but into the other
         static buffer, so SYMBOL is still good for the name lookup.  */

      printf ("%3d  %0.3o  %0.2x    ", code, code, code);
      print_symbol (charset, code);

      charname = symbol_to_charname (symbol);
      if (charname != NULL)
        printf ("   %s", charname);
      printf ("\n");
    }
}
/* These are the contents of the former NiCE NeXT User Group
   NeXTSTEP/OpenStep software archive, currently hosted by Netfuture.ch.  */