/* syn_utf8.c */

/* Syntax highlighting from Joe's Own Editor: UTF-8 utilities.

      Copyright (C) 2004 Joseph H. Allen
      Copyright (C) 2009-2011 Todd M. Lewis and Sebastiano Vigna

      This file is part of ne, the nice editor.

      This library is free software; you can redistribute it and/or modify it
      under the terms of the GNU General Public License as published by
      the Free Software Foundation; either version 3 of the License, or (at your
      option) any later version.

      This library is distributed in the hope that it will be useful, but
      WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      for more details.

      You should have received a copy of the GNU General Public License
      along with this program; if not, see <http://www.gnu.org/licenses/>.  */


#include "ne.h"

/* Under AmigaOS we have setlocale() but don't have langinfo.h and associated stuff,
 * so we have to disable the whole piece of code
 */
#ifdef __amigaos
#undef HAVE_SETLOCALE
#endif

/* Cygwin has CODESET, but it's crummy */
#ifdef __CYGWIN__
#undef HAVE_SETLOCALE
#endif

/* If it looks old, forget it */
#ifndef CODESET
#undef HAVE_SETLOCALE
#endif

#if defined(HAVE_LOCALE_H) && defined(HAVE_SETLOCALE)
#     include <locale.h>
#       include <langinfo.h>
#endif

/* UTF-8 decoder: push one byte into the state machine.
 *
 * Returns 0 - 7FFFFFFF: a complete character has been decoded
 *                   -1: byte accepted, more bytes are needed
 *                   -2: a sequence was pending but c is not a continuation
 *                       byte; the pending sequence is dropped and c itself
 *                       is not stored
 *                   -3: no sequence was pending and c cannot start one
 *                       (128 - 191, 254 or 255)
 */

int utf8_decode(struct utf8_sm *utf8_sm,unsigned char c)
{
      int remaining;    /* continuation bytes announced by the lead byte */
      int payload;      /* data bits carried by the lead byte itself */

      if (utf8_sm->state) {
            /* In the middle of a multi-byte sequence: only 10xxxxxx is legal. */
            if ((c&0xC0)!=0x80) {
                  utf8_sm->state = 0;
                  return -2;
            }
            utf8_sm->buf[utf8_sm->ptr++] = c;
            utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));
            --utf8_sm->state;
            if (utf8_sm->state == 0)
                  return utf8_sm->accu;
            return -1;
      }

      if (c < 0x80) {
            /* 0 - 127: a character on its own */
            utf8_sm->buf[0] = c;
            utf8_sm->ptr = 1;
            utf8_sm->state = 0;
            return c;
      }

      if (c < 0xC0 || c > 0xFD) {
            /* 128 - 191, 254, 255: cannot begin a sequence */
            utf8_sm->ptr = 0;
            utf8_sm->state = 0;
            return -3;
      }

      /* Lead byte: work out how many continuation bytes follow and which
       * of its low bits are data. */
      if (c < 0xE0) {
            /* 192 - 223 */
            remaining = 1;
            payload = c&0x1F;
      } else if (c < 0xF0) {
            /* 224 - 239 */
            remaining = 2;
            payload = c&0x0F;
      } else if (c < 0xF8) {
            /* 240 - 247 */
            remaining = 3;
            payload = c&0x07;
      } else if (c < 0xFC) {
            /* 248 - 251 */
            remaining = 4;
            payload = c&0x03;
      } else {
            /* 252 - 253 */
            remaining = 5;
            payload = c&0x01;
      }

      utf8_sm->buf[0] = c;
      utf8_sm->ptr = 1;
      utf8_sm->state = remaining;
      utf8_sm->accu = payload;
      return -1;
}

/* Reset a decoder state machine: no continuation bytes pending and nothing
 * buffered.  The accumulator is left untouched. */

void utf8_init(struct utf8_sm *utf8_sm)
{
      utf8_sm->state = 0;
      utf8_sm->ptr = 0;
}


/* Decode an entire string */

int utf8_decode_string(unsigned char *s)
{
      struct utf8_sm sm;
      int x;
      int c = -1;
      utf8_init(&sm);
      for(x=0;s[x];++x)
            c = utf8_decode(&sm,s[x]);
      return c;
}

/* Decode and advance
 *
 * Decodes one character starting at *p using a fresh state machine, then
 * stores the advanced pointer back into *p.  If plen is not NULL, *plen is
 * the number of bytes available and is reduced by the number of bytes
 * consumed.  If plen is NULL the loop is not bounded by a length and stops
 * only when utf8_decode() reports something other than "accepted" (-1).
 *
 * Returns: 0 - 7FFFFFFF: decoded character
 *  -2: incomplete sequence: there was no input, the input ran out in the
 *      middle of a sequence, or a sequence was interrupted by a byte that
 *      is not a continuation byte (that byte is left unconsumed).
 *  -3: bad start of sequence found (the offending byte is consumed).
 *
 * p/plen are always advanced in such a way that repeated calls to utf8_decode_fwrd do not cause
 * infinite loops.
 */

int utf8_decode_fwrd(unsigned char **p,int *plen)
{
      struct utf8_sm sm;
      unsigned char *s = *p;
      int len;
      int c = -2; /* Return this on no more input. */
      if (plen)
            len = *plen;
      else
            len = -1;

      utf8_init(&sm);

      while (len) {
            c = utf8_decode(&sm, *s);
            if (c >= 0) {
                  /* We've got a character */
                  --len;
                  ++s;
                  break;
            } else if (c == -2) {
                  /* Bad sequence detected.  Caller should feed rest of string in again. */
                  break;
            } else if (c == -3) {
                  /* Bad start of UTF-8 sequence.  We need to eat this char to avoid infinite loops. */
                  --len;
                  ++s;
                  /* But we should tell the caller that something bad was found. */
                  break;
            } else {
                  /* If c is -1, utf8_decode accepted the character, so we should get the next one. */
                  --len;
                  ++s;
            }
      }

      /* Leaving the loop with -1 means the input was exhausted while a
       * multi-byte sequence was still pending.  -1 is an internal "accepted"
       * code of utf8_decode, not part of this function's contract, so report
       * the truncated sequence as incomplete. */
      if (c == -1)
            c = -2;

      if (plen)
            *plen = len;
      *p = s;

      return c;
}

/* Generated by Doxygen 1.6.0 */