Logo Search packages:      
Sourcecode: ne version File versions  Download package

search.c

/* Search/replace functions (with and without regular expressions).

      Copyright (C) 1993-1998 Sebastiano Vigna 
      Copyright (C) 1999-2005 Todd M. Lewis and Sebastiano Vigna

      This file is part of ne, the nice editor.

      This program is free software; you can redistribute it and/or modify it
      under the terms of the GNU General Public License as published by the
      Free Software Foundation; either version 2, or (at your option) any
      later version.
      
      This program is distributed in the hope that it will be useful, but
      WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      General Public License for more details.
      
      You should have received a copy of the GNU General Public License along
      with this program; see the file COPYING.  If not, write to the Free
      Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
      02111-1307, USA.  */


#include "ne.h"
#include "regex.h"


/* This is the initial allocation size for regex.library. */

#define START_BUFFER_SIZE 4096


/* This array is used both by the Boyer-Moore algorithm and by the regex
library. It is updated if b->find_string_changed is TRUE (it should always
be the first time the string is searched for). */

static unsigned int d[256];

/* This macro upper cases a character or not, depending on the boolean
sense_case. It is used in find(). Note that the argument *MUST* be unsigned. */

#define CONV(c)   (sense_case ? c : up_case[c])


/* This vector is a translation table for the regex library which maps
lower case characters to upper case characters. It's normally adjusted
on startup according to the current locale. */

unsigned char localised_up_case[256];

/* This vector is a translation table for the regex library which maps
ASCII lower case characters to upper case characters. */

const unsigned char ascii_up_case[256] = {
      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
      0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
      0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
      0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
      0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
      0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
      0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
      0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
      0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
      0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
      0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
      0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
      0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
};



/* Performs a search for the given pattern with a simplified Boyer-Moore
   algorithm starting at the given position, in the given direction, skipping a
   possible match at the current cursor position if skip_first is TRUE. If dir
   is 0, it is fetched from b->opt.search_back. If pattern is NULL, it is
   fetched from b->find_string. In this case, b->find_string_changed is
   checked, and, if it is FALSE, the string is not recompiled. Please check to
   set b->find_string_changed whenever a new string is set in
   b->find_string. The cursor is moved on the occurrence position if a match is
   found. */

int find(buffer * const b, const unsigned char *pattern, int dir, int skip_first) {

      const unsigned char * const up_case = b->encoding == ENC_UTF8 ? ascii_up_case : localised_up_case;
      const int sense_case = (b->opt.case_search != 0);
      unsigned char c, first_char;
      unsigned char *p;
      long i, m;
      int recompile_string;
      long y;
      line_desc *ld;

      if (!pattern) {
            pattern = b->find_string;
            recompile_string = b->find_string_changed || b->last_was_regexp;
      }
      else recompile_string = TRUE;

      if (!dir) dir = b->opt.search_back ? -1 : 1;

      if (!pattern || !(m = strlen(pattern))) return ERROR;

      if (recompile_string) for(i = 0; i < sizeof d / sizeof *d; i++) d[i] = m;

      ld = b->cur_line_desc;
      y = b->cur_line;

      if (dir > 0) {

            if (recompile_string) {
                  for(i = 0; i < m - 1; i++) d[CONV(pattern[i])] = m - i-1;
                  b->find_string_changed = 0;
            }

            p = ld->line + b->cur_pos + m - 1 + (skip_first ? 1 : 0);
            first_char = CONV(pattern[m - 1]);

            while(y < b->num_lines) {

                  assert(ld->ld_node.next != NULL);

                  if (ld->line_len >= m) {

                        while((p - ld->line) < ld->line_len) {

                              if ((c = CONV(*p)) != first_char) p+=d[c];
                              else {
                                    for (i = 1; i < m; i++)
                                          if (CONV(*(p - i)) != CONV(pattern[m - i-1])) {
                                                p+=d[c];
                                                break;
                                          }
                                    if (i == m) {
                                          goto_line(b, y);
                                          goto_pos(b, (p - ld->line) - m + 1);
                                          return OK;
                                    }
                              }
                        }
                  }

                  ld = (line_desc *)ld->ld_node.next;
                  if (ld->ld_node.next) p = ld->line + m-1;
                  y++;
            }
      }
      else {

            if (recompile_string) {
                  for(i = m - 1; i > 0; i--) d[(unsigned char)CONV(pattern[i])] = i;
                  b->find_string_changed = 0;
            }

            p = ld->line + (b->cur_pos > ld->line_len - m ? ld->line_len - m : b->cur_pos + (skip_first ? -1 : 0));
            first_char = CONV(pattern[0]);

            while(y >= 0) {

                  assert(ld->ld_node.prev != NULL);

                  if (ld->line_len >= m) {

                        while((p - ld->line) >= 0) {

                              if ((c = CONV(*p)) != first_char) p-=d[c];
                              else {
                                    for (i = 1; i < m; i++)
                                          if (CONV(*(p + i)) != CONV(pattern[i])) {
                                                p-=d[c];
                                                break;
                                          }
                                    if (i == m) {
                                          goto_line(b, y);
                                          goto_pos(b, p - ld->line);
                                          return OK;
                                    }
                              }
                        }
                  }

                  ld = (line_desc *)ld->ld_node.prev;
                  if (ld->ld_node.prev) p = ld->line + ld->line_len - m;
                  y--;
            }
      }

      return NOT_FOUND;
}



/* Replaces n characters with the given string at the current cursor position,
   and then moves it to the end of the string. */

int replace(buffer * const b, const int n, const char * const string) {

      int len;

      assert(string != NULL);

      len = strlen(string);

      start_undo_chain(b);

      delete_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos, n);

      if (len) insert_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos, string, len);

      end_undo_chain(b);

      goto_pos(b, b->cur_pos + len);

      return OK;
}



/* The following variables are used by regex. In particular, re_reg holds
the stard/end of the extended replacement registers. */

static struct re_pattern_buffer re_pb;
static struct re_registers re_reg;

/* This string is used to replace the dot in UTF-8 searches. It will match only
 whole UTF-8 sequences. */

#define UTF8DOT "([\x01-\x7F\xC0-\xFF][\x80-\xBF]*)"

/* This string is prefixed to a complemented character class to force matches
      against UTF-8 non-US-ASCII characters.  It will match any UTF-8 sequence of
      length at least two, besides all characters expressed by the character
      class. Note that a closing ] and a closing ) must be appended. */

#define UTF8COMP "([\xC0-\xFF][\x80-\xBF]+|[^"

/* This string is used to replace non-word-constituents (\W) in UTF-8
 searches. It will match only whole UTF-8 sequences of non-word-constituent
 characters. */

#define UTF8NONWORD "([\x01-\x1E\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]|[\xC0-\xFF][\x80-\xBF]+)"

/* In UTF-8 text, the numbering of a parenthesised group may differ from the
   "official" one, due to the usage of parenthesis in UTF8DOT, UT8COMP and
   UTF8NONWORD. This array records for each user-invoked group the
   corresponding (usually larger) regex group. The group may be larger than
   RE_NREGS, in which case there is no way to recover it. */

static int map_group[RE_NREGS];

/* Works exactly like find(), but uses the regex library instead. */

int find_regexp(buffer * const b, const unsigned char *regex, int dir, int skip_first) {

      const unsigned char * const up_case = b->encoding == ENC_UTF8 ? ascii_up_case : localised_up_case;
      const unsigned char *p;
      int recompile_string, i, y, start_pos;
      line_desc *ld;

      if (!regex) {
            regex = b->find_string;
            recompile_string = b->find_string_changed || !b->last_was_regexp;
      }
      else recompile_string = TRUE;

      if (!dir) dir = b->opt.search_back ? -1 : 1;

      if (!regex || !strlen(regex)) return ERROR;

      if (re_pb.buffer == NULL) {
            if (re_pb.buffer = malloc(START_BUFFER_SIZE)) re_pb.allocated = START_BUFFER_SIZE;
            else return OUT_OF_MEMORY;
      }

      re_pb.fastmap = (void *)d;

      /* We have to be careful: even if the search string has not changed, it
      is possible that case sensitivity has. In this case, we force recompilation. */

      if (b->opt.case_search) {
            if (re_pb.translate != 0) recompile_string = TRUE;
            re_pb.translate = 0;
      }
      else {
            if (re_pb.translate != (char *)up_case) recompile_string = TRUE;
            re_pb.translate = (char *)up_case;
      }

      if (recompile_string) {
            const unsigned char *actual_regex = regex;

            /* If the buffer encoding is UTF-8, we need to replace dots with UTF8DOT,
                  non-word-constituents (\W) with UTF8NONWORD, and embed complemented
                  character classes in UTF8COMP, so that they do not match UTF-8
                  subsequences. Moreover, we must compute the remapping from the virtual
                  to the actual groups caused by the new groups thus introduced. */

            if (b->encoding == ENC_UTF8) {
                  const unsigned char *s;
                  unsigned char *q;
                  int virtual_group = 0, real_group = 0, escape = FALSE, dots = 0, comps = 0, nonwords = 0;

                  s = regex;

                  /* We first scan regex to compute the exact number of characters of
                        the actual (i.e., after substitutions) regex. */

                  do {
                        if (!escape) {
                              if (*s == '.') dots++;
                              else if (*s == '[') {
                                    if (*(s+1) == '^') {
                                          comps++;
                                          s++;
                                    }

                                    if (*(s+1) == ']') s++; /* A literal ]. */

                                    /* We scan the list up to ] and check that no non-US-ASCII characters appear. */
                                    do if (utf8len(*(++s)) != 1) return UTF8_REGEXP_CHARACTER_CLASS_NOT_SUPPORTED; while(*s && *s != ']');
                              }
                              else if (*s == '\\') {
                                    escape = TRUE;
                                    continue;
                              }
                        }
                        else if (*s == 'W') nonwords++;
                        escape = FALSE;
                  } while(*(++s));

                  actual_regex = q = malloc(strlen(regex) + 1 + (strlen(UTF8DOT) - 1) * dots + (strlen(UTF8NONWORD) - 2) * nonwords + (strlen(UTF8COMP) - 1) * comps);
                  if (!actual_regex) return OUT_OF_MEMORY;
                  s = regex;
                  escape = FALSE;

                  do {
                        if (escape || *s != '.' && *s != '(' && *s != '[' && *s != '\\') {
                              if (escape && *s == 'W') {
                                    q--;
                                    strcpy(q, UTF8NONWORD);
                                    q += strlen(UTF8NONWORD);
                                    real_group++;
                              }
                              else *(q++) = *s;
                        }
                        else {
                              if (*s == '\\') {
                                    escape = TRUE;
                                    *(q++) = '\\';
                                    continue;
                              }

                              if (*s == '.') {
                                    strcpy(q, UTF8DOT);
                                    q += strlen(UTF8DOT);
                                    real_group++;
                              }
                              else if (*s == '(') {
                                    *(q++) = '(';
                                    if (virtual_group < RE_NREGS - 1) map_group[++virtual_group] = ++real_group;
                              }
                              else if (*s == '[') {
                                    if (*(s+1) == '^') {
                                          strcpy(q, UTF8COMP);
                                          q += strlen(UTF8COMP);
                                          s++;
                                          if (*(s+1) == ']') *(q++) = *(++s); /* A literal ]. */
                                          do    *(q++) = *(++s); while (*s && *s != ']');
                                          if (*s) *(q++) = ')';
                                          real_group++;
                                    }
                                    else {
                                          *(q++) = '[';
                                          if (*(s+1) == ']') *(q++) = *(++s); /* A literal ]. */
                                          do    *(q++) = *(++s); while (*s && *s != ']');
                                    }
                              }
                        }

                        escape = FALSE;
                  } while(*(s++));
                  
                  /* This assert may be false if a [ is not closed. */
                  assert(strlen(actual_regex) == strlen(regex) + (strlen(UTF8DOT) - 1) * dots + (strlen(UTF8NONWORD) - 2) * nonwords + (strlen(UTF8COMP) - 1) * comps);
            }

            p = re_compile_pattern(actual_regex, strlen(actual_regex), &re_pb);

            if (b->encoding == ENC_UTF8) free((void*)actual_regex);

            if (p) {
                  /* Here we have a very dirty hack: since we cannot return the error of
                        regex, we print it here. Which means that we access term.c's
                        functions. 8^( */
                  print_message(p);
                  ring_bell();
                  return ERROR;
            }

      }

      b->find_string_changed = 0;

      ld = b->cur_line_desc;
      y = b->cur_line;

      if (dir > 0) {

            start_pos = b->cur_pos + (skip_first ? 1 : 0);

            while(y < b->num_lines) {

                  assert(ld->ld_node.next != NULL);

                  if (start_pos <= ld->line_len &&
                         (i = re_search(&re_pb, ld->line ? ld->line : (unsigned char *)"", ld->line_len, start_pos, ld->line_len - start_pos, &re_reg))>=0) {
                        goto_line(b, y);
                        goto_pos(b, i);
                        return OK;
                  }

                  ld = (line_desc *)ld->ld_node.next;
                  start_pos = 0;
                  y++;
            }
      }
      else {

            start_pos = b->cur_pos + (skip_first ? -1 : 0);

            while(y >= 0) {

                  assert(ld->ld_node.prev != NULL);

                  if (start_pos >= 0 &&
                         (i =re_search(&re_pb, ld->line ? ld->line : (unsigned char *)"", ld->line_len, start_pos, -start_pos - 1, &re_reg))>=0) {
                        goto_line(b, y);
                        goto_pos(b, i);
                        return OK;
                  }

                  ld = (line_desc *)ld->ld_node.prev;
                  if (ld->ld_node.prev) start_pos = ld->line_len;
                  y--;
            }
      }

      return NOT_FOUND;
}



/* Replaces a regular expression. The given string can contain \0, \1 etc. for
   the pattern matched by the i-th pair of brackets (\0 is the whole
   string). */

int replace_regexp(buffer * const b, const char * const string) {

      int i, pos, len, c, reg_used = FALSE;
      unsigned char *p, *q, *t = NULL;

      assert(string != NULL);

      if (q = p = str_dup(string)) {

            len = strlen(p);

            while(TRUE) {
                  while(*q && *q != '\\') q++;

                  if (!*q) break;

                  i = *(q + 1) - '0';

                  if (*(q + 1) == '\\') {
                        memmove(q, q + 1, strlen(q + 1) + 1);
                        q++;
                        len--;
                  }
                  else if (i >= 0 && i < RE_NREGS && re_reg.start[i] >= 0) {
                        if (b->encoding == ENC_UTF8) {
                              /* In the UTF-8 case, the replacement group index must be
                                    mapped through map_group to recover the real group. */
                              if ((i = map_group[i]) >= RE_NREGS) {
                                    free(p);
                                    return GROUP_NOT_AVAILABLE;
                              }
                        }
                        *q++ = 0;
                        *q++ = i;
                        reg_used = TRUE;
                  }
                  else {
                        free(p);
                        return WRONG_CHAR_AFTER_BACKSLASH;
                  }
            }

            if (reg_used) {
                  if (t = malloc(re_reg.end[0] - re_reg.start[0] + 1)) {
                        memcpy(t, b->cur_line_desc->line + re_reg.start[0], re_reg.end[0] - re_reg.start[0]);
                        t[re_reg.end[0] - re_reg.start[0]] = 0;
                  }
                  else {
                        free(p);
                        return OUT_OF_MEMORY;
                  }
            }

            for(i = RE_NREGS - 1; i >= 0; i--) {
                  re_reg.end[i] -= re_reg.start[0];
                  re_reg.start[i] -= re_reg.start[0];
            }

            start_undo_chain(b);

            delete_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos, re_reg.end[0]);

            q = p;
            pos = 0;

            while(TRUE) {
                  if (strlen(q)) {
                        insert_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos + pos, q, strlen(q));
                        pos += strlen(q);
                  }

                  q += strlen(q) + 1;

                  if (q - p > len) break;

                  if (re_reg.end[*q] - re_reg.start[*q]) {

                        c = t[re_reg.end[*q]];
                        t[re_reg.end[*q]] = 0;

                        insert_stream(b, b->cur_line_desc, b->cur_line, b->cur_pos + pos, t + re_reg.start[*q], re_reg.end[*q] - re_reg.start[*q]);

                        t[re_reg.end[*q]] = c;

                        pos += re_reg.end[*q] - re_reg.start[*q];
                  }

                  q++;
            }

            end_undo_chain(b);

            goto_pos(b, b->cur_pos + pos);
            if (pos == 0 && re_reg.end[0] == 0) char_right(b);

            free(t);
            free(p);
      }
      else return OUT_OF_MEMORY;

      return OK;
}

Generated by  Doxygen 1.6.0   Back to index