deb-goldedplus/chsgen/chsgen.cc

using namespace std;

#include <cctype>
#include <cstring>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iomanip>

typedef struct {
  char name[256];
  int replacement[3];
} unicodechar;

typedef int charset[256];

charset from_charset, to_charset;
unicodechar unicodedata[65536], alt_unicodedata[65536];

static int atox(const char *str)
{
  unsigned int result = 0;
  int i = 0;

  while(isxdigit(str[i]))
    {
       result <<= 4;
       result |= isdigit(str[i]) ? (str[i] - '0') : (toupper(str[i]) - 'A' + 10);
       i++;
    }
  return (int)result;
}

static void load_charset(const char *fn, charset &recode)
{
  int chr;
  for(chr = 0; chr < 128; chr++)
    recode[chr] = chr;
  for(chr = 128; chr < 256; chr++)
    recode[chr] = 0;

  ifstream file(fn);
  if(file)
    {
      char str[1024];
      while(!file.eof())
        {
          str[0] = '\0';
          file.getline(str, sizeof(str));
          if((str[0] == '\0') || (str[0] == '#'))
            continue;
          if(str[0] == '0')
            chr = atox(str + 2);
          else if(str[0] == '=')
            chr = atox(str + 1);
          else
            {
              cerr << "error in input file (string: \"" << str << '\"' << endl;
              continue;
            }
          char *ustr = strstr(str, "U+");
          if(ustr == NULL)
            ustr = strstr(str, "u+");
          if(ustr == NULL)
            {
              cerr << "error in input file (string: \"" << str << '\"' << endl;
              continue;
            }
          recode[chr] = atox(ustr + 2);
        }
    }
  else
    {
      cerr << "error opening file " << fn << endl;
    }
}

static int lookup_char(int unicode_char)
{
  int i;
  bool found = false;
  for(i = 0; i < 256; i++)
    {
      if(to_charset[i] == unicode_char)
        {
          found = true;
          break;
        }
    }
  if(!found)
    {
      i = -1;
    }
  return i;
}

static int lookup_char_by_name(const char *unicode_name)
{
  int i;
  bool found = false;
  for(i = 0; i < 256; i++)
    {
      if(strcmp(unicodedata[to_charset[i]].name, unicode_name) == 0)
        {
          found = true;
          break;
        }
    }
  if(!found)
    {
      i = -1;
    }
  return i;
}

static char *mystrtok(char *str, char *delim)
{
  static char *last = NULL;

  if(str == NULL)
    str = last;

  if(str == NULL)
    return str;

  last = strpbrk(str, delim);
  if(last != NULL)
    {
      *last = '\0';
      last++;
    }
  return str;
}

static void load_unicodedata(const char *fn, unicodechar *udata)
{
  memset(udata, 0, sizeof(unicodechar) * 65536);

  ifstream file(fn);
  if(file)
    {
      int chr;
      char str[1024];
      while(!file.eof())
        {
          str[0] = '\0';
          file.getline(str, sizeof(str));
          if((str[0] == '\0') || (str[0] == '#'))
            continue;
          char *tok = mystrtok(str, ";");
          if(tok == NULL)
            continue;
          chr = atox(tok);
          if(chr > 65536)
            continue;
          tok = mystrtok(NULL, ";");
          if(tok == NULL)
            continue;
          strncpy(udata[chr].name, tok, sizeof(udata[chr].name));
          udata[chr].name[sizeof(udata[chr].name) - 1] = '\0';
          tok = mystrtok(NULL, ";");
          if(tok == NULL)
            continue;
          tok = mystrtok(NULL, ";");
          if(tok == NULL)
            continue;
          tok = mystrtok(NULL, ";");
          if(tok == NULL)
            continue;
          tok = mystrtok(NULL, ";");
          if(tok == NULL)
            continue;
          if((tok[0] != '\0') && !isxdigit(tok[0]))
            {
              tok = strchr(tok, '>');
              if(tok == NULL)
                {
                  continue;
                }
              while((tok[0] != '\0') && !isxdigit(tok[0]))
                {
                  tok++;
                }
            }
          if(tok[0] != '\0')
            {
              udata[chr].replacement[0] = atox(tok);
              while(isxdigit(tok[0]))
                {
                  tok++;
                }
            }
          while((tok[0] != '\0') && !isxdigit(tok[0]))
            {
              tok++;
            }
          if(tok[0] != '\0')
            {
              udata[chr].replacement[1] = atox(tok);
              while(isxdigit(tok[0]))
                {
                  tok++;
                }
            }
          while((tok[0] != '\0') && !isxdigit(tok[0]))
            {
              tok++;
            }
          if(tok[0] != '\0')
            {
              udata[chr].replacement[2] = atox(tok);
            }
        }
    }
  else
    {
      cerr << "error opening file " << fn << endl;
    }
}

static void generate_table(const char *cp1, const char *cp2)
{
  char incp[256], outcp[256], fn[sizeof(incp) + sizeof(outcp) + 4];
  const char *slash = strrchr(cp1, '/');
  const char *backslash = strrchr(cp1, '\\');
  if((slash != NULL) && (backslash != NULL))
    {
      if(slash < backslash)
        slash = backslash;
    }
  else if(slash == NULL)
    slash = backslash;
  strncpy(incp, (slash == NULL) ? cp1 : (slash + 1), sizeof(incp));
  incp[sizeof(incp) - 1] = '\0';
  char *dot = strchr(incp, '.');
  if(dot != NULL)
    dot[0] = '\0';

  slash = strrchr(cp2, '/');
  backslash = strrchr(cp2, '\\');
  if((slash != NULL) && (backslash != NULL))
    {
      if(slash < backslash)
        slash = backslash;
    }
  else if(slash == NULL)
    slash = backslash;
  strncpy(outcp, (slash == NULL) ? cp2 : (slash + 1), sizeof(outcp));
  outcp[sizeof(outcp) - 1] = '\0';
  dot = strchr(outcp, '.');
  if(dot != NULL)
    dot[0] = '\0';

  strcpy(fn, incp);
  strcat(fn, "_");
  strcat(fn, outcp);
  strcat(fn, ".chs");

  ofstream file(fn);
  if(file)
    {
      int chr = 0;

      file << ";" << endl
        << "; This file is a charset conversion module in text form." << endl
        << ";" << endl
        << "; Automatically generated." << endl
        << ";" << endl
        << "0" << endl
        << "0" << endl
        << ";" << endl;

      if(memcmp(from_charset, to_charset, 128 * sizeof(int)) == 0)
        chr = 128;

      file << ((chr == 128) ? 2 : 1) << endl
        << incp << "\t; from charset" << endl
        << outcp << "\t; to charset" << endl
        << ";" << endl;

      for(; chr < 256; chr++)
        {
          int i;

          if((chr != 0) && (from_charset[chr] == 0))
            {
              file << "\\0 ?" << "\t; not defined" << endl;
              continue;
            }
          // try exact match first
          i = lookup_char(from_charset[chr]);
          if(i != -1)
            {
              file << "\\0 \\x" << hex << i << "\t; "
                << unicodedata[from_charset[chr]].name << endl;
              continue;
            }
          // if not found exact match try to compose
          int replacement_char1 = unicodedata[from_charset[chr]].replacement[0];
          if(replacement_char1 != 0)
            {
              i = lookup_char(replacement_char1);
              if(i != -1)
                {
                  int replacement_char2 = unicodedata[from_charset[chr]].replacement[1];
                  if(replacement_char2 == 0)
                    {
                      file << "\\0 \\x" << hex << i << "\t; "
                        << unicodedata[replacement_char1].name << endl;
                      continue;
                    }
                  int j = lookup_char(replacement_char2);
                  if(j != -1)
                    {
                      int replacement_char3 = unicodedata[from_charset[chr]].replacement[2];
                      if(replacement_char3 == 0)
                        {
                          file << "\\x" << hex << i << " \\x" << hex << j << "\t; "
                            << unicodedata[replacement_char1].name << " + "
                            << unicodedata[replacement_char2].name << endl;
                          continue;
                        }
                      int k = lookup_char(replacement_char3);
                      if(k != -1)
                        {
                          file << "\\x" << hex << i << " \\x" << hex << j << " \\x" << hex << k << "\t; "
                            << unicodedata[replacement_char1].name << " + "
                            << unicodedata[replacement_char2].name << " + "
                            << unicodedata[replacement_char3].name << endl;
                          continue;
                        }
                    }
                }
            }
          // if unable to compose through the standart data try our own
          replacement_char1 = alt_unicodedata[from_charset[chr]].replacement[0];
          if(replacement_char1 != 0)
            {
              i = lookup_char(replacement_char1);
              if(i != -1)
                {
                  int replacement_char2 = alt_unicodedata[from_charset[chr]].replacement[1];
                  if(replacement_char2 == 0)
                    {
                      file << "\\0 \\x" << hex << i << "\t; "
                        << unicodedata[replacement_char1].name << endl;
                      continue;
                    }
                  int j = lookup_char(replacement_char2);
                  if(j != -1)
                    {
                      int replacement_char3 = alt_unicodedata[from_charset[chr]].replacement[2];
                      if(replacement_char3 == 0)
                        {
                          file << "\\x" << hex << i << " \\x" << hex << j << "\t; "
                            << unicodedata[replacement_char1].name << " + "
                            << unicodedata[replacement_char2].name << endl;
                          continue;
                        }
                      int k = lookup_char(replacement_char3);
                      if(k != -1)
                        {
                          file << "\\x" << hex << i << " \\x" << hex << j << " \\x" << hex << k << "\t; "
                            << unicodedata[replacement_char1].name << " + "
                            << unicodedata[replacement_char2].name << " + "
                            << unicodedata[replacement_char3].name << endl;
                          continue;
                        }
                    }
                }
            }
          // if not found exact match and unable to compose try to remove
          // macrons, circumflexes, etc.
          const char *letter;
          const char *name = unicodedata[from_charset[chr]].name;
          if((letter = strstr(name, "LETTER ")) != NULL)
            {
              letter += 7;
              const char *letterend = strchr(letter, ' ');
              char lettercopy[256];
              int letterlen = ((letterend == NULL) ? strlen(name) : (letterend - name)) + 1;
              if(letterlen > sizeof(lettercopy))
                letterlen = sizeof(lettercopy);
              strncpy(lettercopy, unicodedata[from_charset[chr]].name, letterlen);
              lettercopy[letterlen - 1] = '\0';
              i = lookup_char_by_name(lettercopy);
              if(i != -1)
                {
                  file << "\\0 \\x" << hex << i << "\t; " << lettercopy;
                  if(letterend)
                    file << " [" << (letterend + 1) << "]";
                  file << endl;
                  continue;
                }
            }
          file << "\\0 ?" << "\t; (" << unicodedata[from_charset[chr]].name << ")" << endl;
        }

      file << "END" << endl;
    }
  else
    {
      cerr << "error opening file " << fn << endl;
    }
}

int main(int ac, char *av[])
{
  if(ac != 3)
    {
      cout << "usage: " << av[0] << " from_charset to_charset" << endl;
      exit(-1);
    }
  load_charset(av[1], from_charset);
  load_charset(av[2], to_charset);
  load_unicodedata("UnicodeData.txt", unicodedata);
  load_unicodedata("AltUnicodeData.txt", alt_unicodedata);
  generate_table(av[1], av[2]);
  return 0;
}