UTF-32 <-> UTF-8 conversion code.

Nothing is better than coding at night!
This was born at ~6 am.
This should also be integrated into Baghdad to drop the Glibmm dependency.
/* unicode.c
 * Copyright (c) 2005 Mohammed Sameer.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*
 * To compile: gcc -o unicode unicode.c
 * ./unicode ~/projects/katoob/misc/tests/utf
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
/*
 * Print the message x on stderr followed by a newline, then terminate the
 * program with exit status 1.
 *
 * The body is wrapped in do { } while (0) so that `ERR("...");` is a single
 * statement and stays valid as the unbraced body of an if that has an else.
 */
#define ERR(x) do { fprintf(stderr, "%s\n", (x)); exit(1); } while (0)
/* Check the len bytes at buff; returns 1 when they are accepted, 0 otherwise. */
int validate(unsigned char *buff, int len);
/* Decode buff with to_unicode() and print the resulting code points. */
void dump_unicode(unsigned char *buff, int len);
/*
 * Decode UTF-8 bytes into a malloc()ed, zero-terminated array of code
 * points. The caller frees the result.
 */
unsigned long *to_unicode(unsigned char *utf8, int len);
/*
 * Encode a zero-terminated array of code points as UTF-8 in a malloc()ed
 * buffer. Returns NULL when the array is empty. The caller frees the result.
 */
unsigned char *to_utf8(unsigned long *unicode);
/* Print one decoded character: its byte offset, byte length and value. */
void print_char(int pos, int len, unsigned long ch);
/* Print every code point of a zero-terminated array in decimal and hex. */
void dump_unicode_string(unsigned long *str);
/* Length in bytes (1 to 4) of the UTF-8 sequence introduced by lead byte ch. */
int get_length(unsigned char ch);
/*
 * Usage: unicode <file>
 *
 * Reads the whole file into memory, validates it, decodes it to an array of
 * code points (printing each one), encodes that array back to UTF-8 and
 * prints the result. Exits with status 1 on any error.
 */
int main(int argc, char *argv[])
{
  FILE *fp;
  struct stat buf;
  size_t want;
  size_t got;
  unsigned char *buff;
  unsigned long *unicode;
  unsigned char *utf8;

  if (argc != 2)
    {
      ERR("Usage unicode <file>");
    }
  if (stat(argv[1], &buf) != 0)
    {
      perror("stat");
      exit(1);
    }
  /* validate() and to_unicode() take the length as an int, so refuse
     anything that would not fit (leaving room for the terminator). */
  if (buf.st_size < 0 || buf.st_size >= INT_MAX)
    {
      ERR("File is too large.");
    }
  want = (size_t)buf.st_size;
  buff = (unsigned char *)malloc(want + 1);
  if (!buff)
    {
      ERR("Not enough memory!");
    }
  /* Binary mode: the bytes must reach the decoder untranslated. */
  fp = fopen(argv[1], "rb");
  if (!fp)
    {
      perror("fopen");
      free(buff);
      exit(1);
    }
  got = fread(buff, 1, want, fp);
  if (got != want)
    {
      /* errno is only meaningful when the stream reports an error; a short
         read can also mean the file shrank after stat(). */
      if (ferror(fp))
        perror("fread");
      else
        fprintf(stderr, "fread: file is shorter than expected\n");
      fclose(fp);
      free(buff);
      exit(1);
    }
  fclose(fp);
  buff[got] = '\0';
  if (!validate(buff, (int)got))
    {
      free(buff);
      ERR("Invalid UTF-8 strings.");
    }
  unicode = to_unicode(buff, (int)got);
  free(buff);
  dump_unicode_string(unicode);
  utf8 = to_utf8(unicode);
  free(unicode);
  /* to_utf8() returns NULL for an empty string; never hand NULL to %s. */
  printf("%s\n", utf8 ? (char *)utf8 : "");
  free(utf8);
  return 0;
}
/*
 * Check that the len bytes at buff form well-formed UTF-8 (RFC 3629).
 *
 * Rejected: stray continuation bytes, lead bytes 0xf8-0xff, sequences cut
 * short by the end of the buffer, continuation bytes outside 0x80-0xbf,
 * overlong encodings, UTF-16 surrogates (U+D800-U+DFFF) and values above
 * U+10FFFF.
 *
 * Returns 1 when the buffer is valid (an empty buffer is valid) and 0
 * otherwise, after printing the index of the offending byte on stdout.
 */
int validate(unsigned char *buff, int len)
{
  int x = 0;

  while (x < len)
    {
      unsigned char lead = buff[x];
      unsigned long ch;
      unsigned long min;
      int extra;
      int y;

      if (lead < 0x80)
        {
          x++;
          continue;
        }
      if ((lead & 0xe0) == 0xc0)
        {
          extra = 1;
          ch = lead & 0x1f;
          min = 0x80;
        }
      else if ((lead & 0xf0) == 0xe0)
        {
          extra = 2;
          ch = lead & 0x0f;
          min = 0x800;
        }
      else if ((lead & 0xf8) == 0xf0)
        {
          extra = 3;
          ch = lead & 0x07;
          min = 0x10000;
        }
      else
        {
          /* Continuation byte without a lead, or 0xf8-0xff. */
          printf("Byte %i is invalid\n", x);
          return 0;
        }
      for (y = 1; y <= extra; y++)
        {
          if (x + y >= len)
            {
              /* Sequence runs past the end: blame its lead byte. */
              printf("Byte %i is invalid\n", x);
              return 0;
            }
          if ((buff[x + y] & 0xc0) != 0x80)
            {
              printf("Byte %i is invalid\n", x + y);
              return 0;
            }
          ch = (ch << 6) | (unsigned long)(buff[x + y] & 0x3f);
        }
      /* min is the smallest value that needs this many bytes; anything
         below it is an overlong encoding. */
      if (ch < min || ch > 0x10ffff || (ch >= 0xd800 && ch <= 0xdfff))
        {
          printf("Byte %i is invalid\n", x);
          return 0;
        }
      x += extra + 1;
    }
  return 1;
}
/*
 * Classify a byte by its high bits and report how many bytes the UTF-8
 * sequence it introduces occupies:
 *   11110xxx -> 4, 1110xxxx -> 3, 110xxxxx -> 2, anything else -> 1.
 * Every byte that matches none of the multi-byte patterns, including
 * 10xxxxxx and 11111xxx, is reported as length 1.
 */
int get_length(unsigned char ch)
{
  if ((ch & 0xf8) == 0xf0)
    return 4;
  if ((ch & 0xf0) == 0xe0)
    return 3;
  if ((ch & 0xe0) == 0xc0)
    return 2;
  return 1;
}
/*
 * Decode UTF-8 into an array of code points.
 *
 * utf8: the bytes to decode; decoding stops at the first NUL byte or after
 *       len bytes, whichever comes first.
 * len:  number of bytes available at utf8 (negative values are treated as 0).
 *
 * Each decoded character is reported through print_char() with its byte
 * offset and byte length. A multi-byte sequence that would extend past len
 * is dropped and ends the decoding, so no byte outside the buffer is read.
 *
 * Returns a malloc()ed array terminated by a 0 element; the caller frees it.
 * Terminates the program through ERR() when memory runs out.
 */
unsigned long *to_unicode(unsigned char *utf8, int len)
{
  unsigned char *p = utf8;
  unsigned long ch;
  unsigned long *result;
  unsigned long *r;
  int x = 0;
  int l;
  int y;

  if (!utf8 || len < 0)
    {
      len = 0;
    }
  /* At most one code point per input byte, plus the terminating 0. */
  result = (unsigned long *)malloc(sizeof(unsigned long) * ((size_t)len + 1));
  if (!result)
    {
      ERR("Ran out of memory!");
    }
  r = result;
  while (x < len && *p)
    {
      l = get_length(*p);
      if (l < 1)
        {
          l = 1;
        }
      if (l > len - x)
        {
          /* Truncated sequence at the end of the buffer. */
          break;
        }
      /* Keep only the payload bits of the lead byte. */
      switch (l)
        {
        case 4:
          ch = *p & 0x07;
          break;
        case 3:
          ch = *p & 0x0f;
          break;
        case 2:
          ch = *p & 0x1f;
          break;
        default:
          ch = *p;
          break;
        }
      ++p;
      /* Each continuation byte contributes its low six bits. */
      for (y = l; y > 1; y--)
        {
          ch = (ch << 6) | (unsigned long)(*p & 0x3f);
          ++p;
        }
      print_char(x, l, ch);
      x += l;
      *r = ch;
      r++;
    }
  *r = 0x0;
  return result;
}
/*
 * Encode a zero-terminated array of code points as UTF-8.
 *
 * unicode: array of code points ending with a 0 element; may be NULL.
 *
 * Values below 0x200000 are encoded in one to four bytes. Larger values
 * cannot be expressed in four bytes and are replaced by U+FFFD.
 *
 * Returns a malloc()ed, NUL-terminated byte string that the caller frees,
 * or NULL when unicode is NULL or contains no code points. Prints
 * "Out of memory" on stderr and exits with status 1 if allocation fails.
 */
unsigned char *to_utf8(unsigned long *unicode)
{
  unsigned char *utf8;
  unsigned char *u;
  unsigned long *s;
  unsigned long ch;
  size_t count = 0;
  int x;
  int y;

  if (!unicode)
    {
      return NULL;
    }
  for (s = unicode; *s; ++s)
    {
      ++count;
    }
  if (count == 0)
    {
      return NULL;
    }
  /* Worst case is four bytes per code point, plus the terminating NUL. */
  utf8 = (unsigned char *)malloc(count * 4 + 1);
  if (!utf8)
    {
      fprintf(stderr, "%s\n", "Out of memory");
      exit(1);
    }
  u = utf8;
  for (s = unicode; *s; ++s)
    {
      ch = *s;
      if (ch >= 0x200000)
        {
          ch = 0xfffd;
        }
      if (ch < 0x80)
        {
          x = 1;
          *u++ = (unsigned char)ch;
        }
      else if (ch < 0x800)
        {
          x = 2;
          *u++ = (unsigned char)(0xc0 | (ch >> 6));
        }
      else if (ch < 0x10000)
        {
          x = 3;
          *u++ = (unsigned char)(0xe0 | (ch >> 12));
        }
      else
        {
          x = 4;
          *u++ = (unsigned char)(0xf0 | (ch >> 18));
        }
      /* Continuation bytes, most significant six-bit group first; the
         group must be shifted down before it is masked. */
      for (y = x; y > 1; y--)
        {
          *u++ = (unsigned char)(0x80 | ((ch >> ((y - 2) * 6)) & 0x3f));
        }
    }
  *u = '\0';
  return utf8;
}
/*
 * Convenience wrapper: decode the len bytes at buff with to_unicode(),
 * print the decoded code points with dump_unicode_string(), and release
 * the temporary array.
 */
void dump_unicode(unsigned char *buff, int len)
{
  unsigned long *decoded;

  decoded = to_unicode(buff, len);
  dump_unicode_string(decoded);
  free(decoded);
}
/*
 * Print every code point of a zero-terminated array on stdout, one per
 * line as "<decimal> <hex>", followed by an empty line.
 *
 * str: array ending with a 0 element; NULL is treated as an empty array.
 */
void dump_unicode_string(unsigned long *str)
{
  unsigned long *s;

  if (str)
    {
      for (s = str; *s; s++)
        {
          /* The elements are unsigned long, so %lu rather than %li. */
          printf("%lu %lx\n", *s, *s);
        }
    }
  printf("\n");
}
/*
 * Print one line on stdout describing a decoded character.
 *
 * pos: value shown in the "Character" column
 * len: value shown in the "Length" column
 * ch:  code point, shown in hexadecimal and in decimal
 */
void print_char(int pos, int len, unsigned long ch)
{
  /* ch is unsigned long, so the decimal column uses %lu rather than %li. */
  printf("Character: %i\tLength: %i\tUTF-32(hex): %lx\tUTF-32(dec): %lu\n", pos, len, ch, ch);
}
Coding and hacking
FLOSS
16454 views
UTF-32 <-> UTF-8 conversion code.

Add new comment