diff options
Diffstat (limited to 'contrib/tsearch/dict/porter_english.dct')
-rw-r--r-- | contrib/tsearch/dict/porter_english.dct | 1289 |
1 files changed, 0 insertions, 1289 deletions
diff --git a/contrib/tsearch/dict/porter_english.dct b/contrib/tsearch/dict/porter_english.dct deleted file mode 100644 index 6c472298284..00000000000 --- a/contrib/tsearch/dict/porter_english.dct +++ /dev/null @@ -1,1289 +0,0 @@ -/* - * ----START-LICENCE---- - * Copyright 1999,2000 BrightStation PLC - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 - * USA - * -----END-LICENCE----- - */ -/* Version 1: see http://open.muscat.com/ for further information */ - - -#ifdef DICT_BODY -#include <ctype.h> /* tolower */ - -static void * setup_english_stemmer(void); - -static const char * english_stem(void * z, const char * q, int i0, int i1); - -static void closedown_english_stemmer(void * z); - - -/* To set up the english stemming process: - - void * z = setup_stemmer(); - - to use it: - - char * p = stem(z, q, i0, i1); - - The word to be stemmed is in byte address q offsets i0 to i1 - inclusive (i.e. from q[i0] to q[i1]). The stemmed result is the - C string at address p. - - To close down the stemming process: - - closedown_stemmer(z); - -*/ - -/* The English stemming algorithm is essentially the Porter stemming - * algorithm, and has been coded up by its author. It follows the algorithm - * presented in - * - * Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - * no. 
/*
 * (Reference, continued) Porter, 1980, "An algorithm for suffix stripping",
 * Program, Vol. 14, no. 3, pp 130-137.
 *
 * This implementation only differs from the published algorithm at the
 * points marked -DEPARTURE- and -NEW- below.
 *
 * For a more faithful version of the Porter algorithm, see
 *
 *     http://www.muscat.com/~martin/stem.html
 */

/* Later additions:

   June 2000

   The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
   short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.

   This follows a suggestion of Barry Wilkins, research student at
   Birmingham.


   February 2000

   the cvc test for not dropping final -e now looks after vc at the
   beginning of a word, so are, eve, ice, ore, use keep final -e. In this
   test c is any consonant, including w, x and y. This extension was
   suggested by Chris Emerson.

   -fully    -> -ful   treated like  -fulness -> -ful, and
   -tionally -> -tion  treated like  -tional  -> -tion

   both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.

   Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A sorted table of irregular word forms; see create_pool/search_pool. */
struct pool {
    int size;                       /* number of entries */
    struct pool_entry * entries;    /* entries, sorted by (length, bytes) */
};

/* This is used as a library to resolve exceptions in the various
   stemming algorithms. Typical use is,

       struct pool * p = create_pool(t);
       const char * s_translated = search_pool(p, strlen(s), s);
       ...
       free_pool(p);

   t is an array of strings, e.g.

       static const char * t[] = {

           "sky", "sky/skies/",
           "die", "dying/",
           "lie", "lying/",
           "tie", "tying/",
           ....
           0, 0

       };

   if s is "sky", "skies", "dying" etc., s_translated becomes "sky",
   "sky", "die" etc.

   The code includes a sort/merge capability which may be turned into
   (or replaced by) something more general later on.
*/

/* Allocate n bytes (always at least one, so that a zero-sized request can
   never be mistaken for a failure). On failure the error is reported on
   stderr and the process exits, which is how create_pool below already
   treats unrecoverable set-up errors.
*/

static void * english_xmalloc(size_t n)
{
    void * mem = malloc(n > 0 ? n : 1);

    if (mem == NULL)
    {
        fprintf(stderr, "english stemmer: out of memory\n");
        exit(1);
    }
    return mem;
}

/* merge(n, p, q, r, l, k, f) repeatedly merges n-byte sequences of items of
   size k from addresses p and q into r. f is the comparison routine and
   l is the limit point for q.
*/

static void merge(int n, char * p, char * q, char * r, char * l, int k,
                  int (*f)(char *, char *))
{
    char * q0 = q;

    if (q0 > l) { memmove(r, p, l - p); return; }
    while (p < q0)
    {
        char * pl = n + p;      /* end of the current run taken from p */
        char * ql = n + q;      /* end of the current run taken from q */

        if (ql > l) ql = l;
        for (;;)
        {
            /* once one run is exhausted, the rest of the other is copied */
            if (p >= pl) { memmove(r, q, ql - q); r += ql - q; q = ql; break; }
            if (q >= ql) { memmove(r, p, pl - p); r += pl - p; p = pl; break; }
            if (f(p, q)) { memmove(r, p, k); p += k; }
            else { memmove(r, q, k); q += k; }
            r += k;
        }
    }
    memmove(r, q, l - q);
}

/* In sort(p, c, l, k, f), p+c is a byte address at which begin a sequence of
   items of size k to be sorted. p+l is the address of the byte after the
   last of these items, so l - c is divisible by k. f is a comparison function
   for a pair of these items: f(p+i, q+j) is true if the item at p+i is before
   the item at q+j, false if it is equal to or after it.
*/

static void sort(char * p, int c, int l, int k,
                 int (*f)(char *, char *))
{
    int w = l - c;      /* number of bytes to sort */
    int j = k;          /* current run length in bytes */
    char * q;           /* temporary work space */

    /* zero or one item (or a nonsensical item size): nothing to do */
    if (k <= 0 || w <= k) return;

    q = english_xmalloc((size_t) w);
    while (j < w)
    {
        int cycle;

        /* Passes come in pairs: the first merges into the work space, the
           second merges back, so the result always ends up at p+c. */
        for (cycle = 1; cycle <= 2; cycle++)
        {
            int h = (w + j - 1) / j / 2 * j;    /* half way */

            if (cycle == 1) merge(j, p + c, p + c + h, q, p + l, k, f);
            else merge(j, q, q + h, p + c, q + w, k, f);
            j *= 2;
        }
    }
    free(q);
}

/* One irregular form: the bytes pointer[0 .. length-1] map to the C string
   translation. pointer is not zero terminated at length.
*/

struct pool_entry {
    const char * translation;
    const char * pointer;
    int length;
};

/* Write one entry to stderr as "<form> --> <translation>". */

static void print_entry(struct pool_entry * p)
{
    fprintf(stderr, "%.*s", p->length, p->pointer);
    fprintf(stderr, " --> %s\n", p->translation);
}

/*
    - debugging aid
    static void print_pool(struct pool * p)
    {   int i;
        int size = p->size;
        struct pool_entry * q = p->entries;
        fprintf(stderr, "\nPool:\n");
        for (i = 0; i < size; i++) print_entry(q+i);
    }
*/

/* compare(p, q) is our comparison function, used for f above: entries are
   ordered by length first and by byte content second.
*/

static int compare(char * char_p, char * char_q)
{
    struct pool_entry * p = (struct pool_entry *) char_p;
    struct pool_entry * q = (struct pool_entry *) char_q;

    if (p->length == q->length)
        return memcmp(p->pointer, q->pointer, p->length) < 0;
    return p->length < q->length;
}

/* Count the '/' characters in the odd-numbered strings of s, i.e. the
   number of forms the table defines.
*/

static int count_slashes(const char * s[])
{
    int slash_count = 0;
    int i;

    for (i = 1; s[i] != 0; i += 2)
    {
        const char * p = s[i];
        int j = 0;

        while (p[j] != 0) if (p[j++] == '/') slash_count++;
    }
    return slash_count;
}

/* Build a pool from the 0, 0 terminated table s (format described above).
   The entries point into s, so s must outlive the pool. A form list without
   its final '/' is reported and the process exits; a form listed twice only
   produces a warning on stderr.
*/

static struct pool * create_pool(const char * s[])
{
    int size = count_slashes(s);
    struct pool_entry * z = english_xmalloc((size_t) size * sizeof(struct pool_entry));
    struct pool_entry * next = z;
    struct pool * result;
    int i;

    for (i = 1; s[i] != 0; i += 2)
    {
        const char * p = s[i];
        int j = 0;
        int j0 = 0;     /* start of the form currently being scanned */

        for (;;)
        {
            if (p[j] == 0)
            {
                if (j0 != j) { fprintf(stderr, "%s lacks final '/'\n", p); exit(1); }
                break;
            }
            if (p[j] == '/')
            {
                next->translation = s[i - 1];
                next->pointer = p + j0;
                next->length = j - j0;
                next++;
                j0 = j + 1;
            }
            j++;
        }
    }
    sort((char *) z, 0, (int) (size * sizeof(struct pool_entry)),
         (int) sizeof(struct pool_entry), compare);

    /* now validate the contents: after sorting, duplicates are adjacent */

    for (i = 1; i < size; i++)
    {
        struct pool_entry * cur = z + i;
        struct pool_entry * prev = z + i - 1;

        if (cur->length == prev->length &&
            memcmp(cur->pointer, prev->pointer, cur->length) == 0)
        {
            fprintf(stderr, "warning: "); print_entry(cur);
            fprintf(stderr, " and "); print_entry(prev);
        }
    }

    result = english_xmalloc(sizeof(struct pool));
    result->entries = z;
    result->size = size;
    return result;
}

/* Three-way comparison in the same (length, bytes) order that compare uses. */

static int compare_to_pool(int length, const char * s, int length_p, const char * s_p)
{
    if (length != length_p) return length - length_p;
    return memcmp(s, s_p, length);
}

/* Binary search for the length bytes at s. Returns the translation of the
   matching entry, or 0 if there is none.
*/

static const char * search_pool(struct pool * p, int length, const char * s)
{
    int i = 0;
    int j = p->size;
    struct pool_entry * q = p->entries;

    if (j == 0) return 0;  /* empty pool */
    if (compare_to_pool(length, s, q->length, q->pointer) < 0) return 0;
    for (;;)
    {
        int h = (i + j) / 2;
        int diff = compare_to_pool(length, s, (q + h)->length, (q + h)->pointer);

        if (diff == 0) return (q + h)->translation;
        if (j - i <= 1) return 0;
        if (diff < 0) j = h; else i = h;
    }
}

/* Release a pool made by create_pool. A null pointer is ignored. */

static void free_pool(struct pool * p)
{
    if (p == NULL) return;
    free(p->entries);
    free(p);
}

struct english_stemmer
{
    char * p;                   /* work buffer holding the word */
    int p_size;                 /* allocated size of p */
    int k;                      /* index of the last letter of the word */
    int j;                      /* index of the last letter of the stem */
    struct pool * irregulars;   /* table of irregular forms */
};

/* The main part of the stemming algorithm starts here. z->p is a buffer
   holding a word to be stemmed. The letters are in z->p[0], z->p[1] ...
   ending at z->p[z->k]. z->k is readjusted downwards as the stemming
   progresses. Zero termination is not in fact used in the algorithm.

   Note that only lower case sequences are stemmed. Forcing to lower case
   should be done before english_stem(...) is called.

   We will write p, k etc in place of z->p, z->k in the comments.
*/

/* cons(z, i) is true <=> p[i] is a consonant. A 'y' counts as a consonant
   at the start of the word or after a vowel.
*/

static int cons(struct english_stemmer * z, int i)
{
    switch (z->p[i])
    {
        case 'a': case 'e': case 'i': case 'o': case 'u':
            return 0;
        case 'y':
            return (i == 0) ? 1 : !cons(z, i - 1);
        default:
            return 1;
    }
}

/* m(z) measures the number of consonant sequences between 0 and j. if c is
   a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
   presence,

      <c><v>       gives 0
      <c>vc<v>     gives 1
      <c>vcvc<v>   gives 2
      <c>vcvcvc<v> gives 3
      ....
*/

static int m(struct english_stemmer * z)
{
    int n = 0;
    int i = 0;

    /* skip the optional leading consonant sequence */
    for (;;)
    {
        if (i > z->j) return n;
        if (!cons(z, i)) break;
        i++;
    }
    i++;
    for (;;)
    {
        /* skip a vowel sequence */
        for (;;)
        {
            if (i > z->j) return n;
            if (cons(z, i)) break;
            i++;
        }
        i++;
        n++;
        /* skip the consonant sequence that has just been counted */
        for (;;)
        {
            if (i > z->j) return n;
            if (!cons(z, i)) break;
            i++;
        }
        i++;
    }
}

/* vowelinstem(z) is true <=> p[0], ... p[j] contains a vowel.
*/

static int vowelinstem(struct english_stemmer * z)
{
    int i;

    for (i = 0; i <= z->j; i++) if (!cons(z, i)) return 1;
    return 0;
}

/* doublec(z, i) is true <=> p[i], p[i - 1] contain a double consonant.
*/

static int doublec(struct english_stemmer * z, int i)
{
    if (i < 1) return 0;
    if (z->p[i] != z->p[i - 1]) return 0;
    return cons(z, i);
}

/* cvc(z, i) is true <=>

   a) ( -NEW- ) i == 1, and p[0] p[1] is vowel consonant, or

   b) p[i - 2], p[i - 1], p[i] has the form consonant -
      vowel - consonant and also if the second c is not w, x or y. this is used
      when trying to restore an e at the end of a short word. e.g.

         cav(e), lov(e), hop(e), crim(e), but
         snow, box, tray.
*/

static int cvc(struct english_stemmer * z, int i)
{
    if (i == 0) return 0;   /* i == 0 never happens perhaps */

    if (i == 1) return !cons(z, 0) && cons(z, 1);

    if (!cons(z, i) || cons(z, i - 1) || !cons(z, i - 2)) return 0;
    {
        int ch = z->p[i];

        if (ch == 'w' || ch == 'x' || ch == 'y') return 0;
    }
    return 1;
}

/* ends(z, s, length) is true <=> p[0], ... p[k] ends with the string s.
   On success j is set to the index of the last letter before the suffix.
*/

static int ends(struct english_stemmer * z, const char * s, int length)
{
    if (length > z->k + 1) return 0;
    if (memcmp(z->p + z->k - length + 1, s, length) != 0) return 0;
    z->j = z->k - length;
    return 1;
}

/* setto(z, s, length) sets p[j + 1] ... to the characters in the string s,
   readjusting k.
*/

static void setto(struct english_stemmer * z, const char * s, int length)
{
    memmove(z->p + z->j + 1, s, length);
    z->k = z->j + length;
}

/* r(z, s, length) is used further down: it replaces the suffix found by
   ends() with s, provided the stem before it has m(z) > 0.
*/

static void r(struct english_stemmer * z, const char * s, int length)
{
    if (m(z) > 0) setto(z, s, length);
}

/* step_1ab(z) gets rid of plurals and -ed or -ing. e.g.

       caresses  ->  caress
       ponies    ->  poni
       sties     ->  sti
       tie       ->  tie       (-NEW-: see below)
       caress    ->  caress
       cats      ->  cat

       feed      ->  feed
       agreed    ->  agree
       disabled  ->  disable

       matting   ->  mat
       mating    ->  mate
       meeting   ->  meet
       milling   ->  mill
       messing   ->  mess

       meetings  ->  meet
*/

static void step_1ab(struct english_stemmer * z)
{
    if (z->p[z->k] == 's')
    {
        if (ends(z, "sses", 4))
            z->k -= 2;
        else if (ends(z, "ies", 3))
        {
            /* this test extends the original algorithm, so that
               'flies'->'fli' but 'dies'->'die' etc */
            if (z->j == 0) z->k--; else z->k -= 2;
        }
        else if (z->p[z->k - 1] != 's')
            z->k--;
    }

    if (ends(z, "ied", 3))
    {
        /* this test extends the original algorithm, so that
           'spied'->'spi' but 'died'->'die' etc */
        if (z->j == 0) z->k--; else z->k -= 2;
    }
    else if (ends(z, "eed", 3))
    {
        if (m(z) > 0) z->k--;
    }
    else if ((ends(z, "ed", 2) || ends(z, "ing", 3)) && vowelinstem(z))
    {
        z->k = z->j;
        if (ends(z, "at", 2)) setto(z, "ate", 3);
        else if (ends(z, "bl", 2)) setto(z, "ble", 3);
        else if (ends(z, "iz", 2)) setto(z, "ize", 3);
        else if (doublec(z, z->k))
        {
            z->k--;
            {
                int ch = z->p[z->k];

                /* -ll, -ss and -zz keep their double consonant */
                if (ch == 'l' || ch == 's' || ch == 'z') z->k++;
            }
        }
        else if (m(z) == 1 && cvc(z, z->k)) setto(z, "e", 1);
    }
}

/* step_1c(z) turns terminal y to i when there is another vowel in the stem.

   -NEW-: This has been modified from the original Porter algorithm so that y->i
   is only done when y is preceded by a consonant, but not if the stem
   is only a single consonant, i.e.

       (*c and not c) Y -> I

   So 'happy' -> 'happi', but
      'enjoy' -> 'enjoy'  etc

   This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
   'enjoy'. Step 1c is perhaps done too soon; but with this modification that
   no longer really matters.

   Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
   'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
   'flies' ...
*/

static void step_1c(struct english_stemmer * z)
{
    if (ends(z, "y", 1) && z->j > 0 && cons(z, z->k - 1)) z->p[z->k] = 'i';
}

/* step_2(z) maps double suffixes to single ones. so -ization ( = -ize plus
   -ation) maps to -ize etc. Note that the string before the suffix must give
   m(z) > 0.
*/

static void step_2(struct english_stemmer * z)
{
    /* step_1ab can leave a one-letter word ('ies' -> 'i', 'ied' -> 'i');
       there is then no penultimate letter to dispatch on, and no suffix
       below could match anyway. */
    if (z->k < 1) return;

    switch (z->p[z->k - 1])
    {
        case 'a':
            if (ends(z, "ational", 7)) { r(z, "ate", 3); break; }
            if (ends(z, "tional", 6)) { r(z, "tion", 4); break; }
            break;
        case 'c':
            if (ends(z, "enci", 4)) { r(z, "ence", 4); break; }
            if (ends(z, "anci", 4)) { r(z, "ance", 4); break; }
            break;
        case 'e':
            if (ends(z, "izer", 4)) { r(z, "ize", 3); break; }
            break;
        case 'l':
            if (ends(z, "bli", 3)) { r(z, "ble", 3); break; } /*-DEPARTURE-*/

            /* To match the published algorithm, replace this line with
               case 'l':
                   if (ends(z, "abli", 4)) { r(z, "able", 4); break; }
            */
            if (ends(z, "alli", 4))
            {
                if (m(z) > 0) { setto(z, "al", 2); step_2(z); } /*-NEW-*/
                break;
            }

            if (ends(z, "fulli", 5)) { r(z, "ful", 3); break; } /*-NEW-*/
            if (ends(z, "entli", 5)) { r(z, "ent", 3); break; }
            if (ends(z, "eli", 3)) { r(z, "e", 1); break; }
            if (ends(z, "ousli", 5)) { r(z, "ous", 3); break; }
            break;
        case 'o':
            if (ends(z, "ization", 7)) { r(z, "ize", 3); break; }
            if (ends(z, "ation", 5)) { r(z, "ate", 3); break; }
            if (ends(z, "ator", 4)) { r(z, "ate", 3); break; }
            break;
        case 's':
            if (ends(z, "alism", 5)) { r(z, "al", 2); break; }
            if (ends(z, "iveness", 7)) { r(z, "ive", 3); break; }
            if (ends(z, "fulness", 7)) { r(z, "ful", 3); break; }
            if (ends(z, "ousness", 7)) { r(z, "ous", 3); break; }
            break;
        case 't':
            if (ends(z, "aliti", 5)) { r(z, "al", 2); break; }
            if (ends(z, "iviti", 5)) { r(z, "ive", 3); break; }
            if (ends(z, "biliti", 6)) { r(z, "ble", 3); break; }
            break;
        case 'g':
            if (ends(z, "logi", 4))
            {
                z->j++;                 /*-NEW-*/ /*(Barry Wilkins)*/
                r(z, "og", 2); break;
            } /*-DEPARTURE-*/

            /* To match the published algorithm, delete this line */
            break;
        default:
            break;
    }
}

/* step_3(z) deals with -ic-, -full, -ness etc. Similar strategy to step_2.
*/

static void step_3(struct english_stemmer * z)
{
    switch (z->p[z->k])
    {
        case 'e':
            if (ends(z, "icate", 5)) { r(z, "ic", 2); break; }
            if (ends(z, "ative", 5)) { r(z, "", 0); break; }
            if (ends(z, "alize", 5)) { r(z, "al", 2); break; }
            break;
        case 'i':
            if (ends(z, "iciti", 5)) { r(z, "ic", 2); break; }
            break;
        case 'l':
            if (ends(z, "ical", 4)) { r(z, "ic", 2); break; }
            if (ends(z, "ful", 3)) { r(z, "", 0); break; }
            break;
        case 's':
            if (ends(z, "ness", 4)) { r(z, "", 0); break; }
            break;
        default:
            break;
    }
}

/* step_4() takes off -ant, -ence etc., in context <c>vcvc<v>.
*/

static void step_4(struct english_stemmer * z)
{
    /* one-letter word: no penultimate letter, nothing to strip (see step_2) */
    if (z->k < 1) return;

    switch (z->p[z->k - 1])
    {
        case 'a':
            if (ends(z, "al", 2)) break;
            return;
        case 'c':
            if (ends(z, "ance", 4)) break;
            if (ends(z, "ence", 4)) break;
            return;
        case 'e':
            if (ends(z, "er", 2)) break;
            return;
        case 'i':
            if (ends(z, "ic", 2)) break;
            return;
        case 'l':
            if (ends(z, "able", 4)) break;
            if (ends(z, "ible", 4)) break;
            return;
        case 'n':
            if (ends(z, "ant", 3)) break;
            if (ends(z, "ement", 5)) break;
            if (ends(z, "ment", 4)) break;
            if (ends(z, "ent", 3)) break;
            return;
        case 'o':
            if (ends(z, "ion", 3) && (z->p[z->j] == 's' ||
                                      z->p[z->j] == 't')) break;
            if (ends(z, "ou", 2)) break;
            return;
            /* takes care of -ous */
        case 's':
            if (ends(z, "ism", 3)) break;
            return;
        case 't':
            if (ends(z, "ate", 3)) break;
            if (ends(z, "iti", 3)) break;
            return;
        case 'u':
            if (ends(z, "ous", 3)) break;
            return;
        case 'v':
            if (ends(z, "ive", 3)) break;
            return;
        case 'z':
            if (ends(z, "ize", 3)) break;
            return;
        default:
            return;
    }
    if (m(z) > 1) z->k = z->j;
}

/* step_5(z) removes a final -e if m(z) > 1, and changes -ll to -l if
   m(z) > 1.
*/

static void step_5(struct english_stemmer * z)
{
    z->j = z->k;
    if (z->p[z->k] == 'e')
    {
        int a = m(z);

        if (a > 1 || (a == 1 && !cvc(z, z->k - 1))) z->k--;
    }
    if (z->p[z->k] == 'l' && doublec(z, z->k) && m(z) > 1) z->k--;
}

/* english_stem(z, q, i0, i1) stems the (lower case) word q[i0] ... q[i1].

   The result is a C string; z->k is left holding the index of its last
   letter. It is either the work buffer z->p, which the next call overwrites,
   or, for an irregular form, a string owned by the irregulars table. An
   empty or inverted range (i1 < i0) is treated as the empty word.
*/

static const char * english_stem(void * z_, const char * q, int i0, int i1)
{
    struct english_stemmer * z = (struct english_stemmer *) z_;
    int len = i1 - i0 + 1;

    /* an inverted range would otherwise become a huge memmove size */
    if (len < 0) len = 0;

    if (len + 49 > z->p_size)
    {
        free(z->p);
        z->p_size = len + 74;   /* ample */
        z->p = (char *) english_xmalloc((size_t) z->p_size);
    }

    memmove(z->p, q + i0, len);

    z->k = len - 1;

    {
        const char * t = search_pool(z->irregulars, z->k + 1, z->p);

        if (t != 0)
        {
            z->k = (int) strlen(t) - 1;
            return t;
        }
    }

    if (z->k > 1) /*-DEPARTURE-*/

    /* With this line, strings of length 1 or 2 don't go through the
       stemming process, although no mention is made of this in the
       published algorithm. Remove the line to match the published
       algorithm. */

    {
        step_1ab(z); step_1c(z);
        step_2(z);
        step_3(z);
        step_4(z);
        step_5(z);
    }

    z->p[z->k + 1] = 0; /* C string form for now */
    return z->p;
}

/* -NEW-
   This is a table of irregular forms. It is quite short, but still
   reflects the errors actually drawn to Martin Porter's attention over
   a 20 year period!

   Extend it as necessary.

   The form of the table is:

       "p1" "s11/s12/s13/ ... /"
       "p2" "s21/s22/s23/ ... /"
       ...
       "pn" "sn1/sn2/sn3/ ... /"
       0, 0

   String sij is mapped to paradigm form pi, and the main stemming
   process is then bypassed.
*/
-*/ - -static const char * irregular_forms[] = { - - "sky", "sky/skies/", - "die", "dying/", - "lie", "lying/", - "tie", "tying/", - "news", "news/", - "inning", "innings/inning/", - "outing", "outings/outing/", - "canning", "cannings/canning/", - "howe", "howe/", - - /*-NEW-*/ - "proceed", "proceed/", - "exceed", "exceed/", - "succeed", "succeed/", /* Hiranmay Ghosh */ - - 0, 0 /* terminator */ - -}; - - -/* - * is_stopword part - */ -typedef struct { - unsigned char val; - unsigned char flag; - unsigned char right; - - unsigned char child; -} ESWNODE; - -/* is exists left tree ? */ -#define L 0x01 -/* finish word flag */ -#define F 0x02 -#define ISLEFT(x) (((ESWNODE*)x)->flag & L) -#define ISFINISH(x) (((ESWNODE*)x)->flag & F) - - -static ESWNODE engstoptree[] = { - {'m',L,9,126}, - {'d',L,4,71}, - {'b',L,2,40}, - {'a',F,0,14}, - {'c',0,0,62}, - {'f',L,2,79}, - {'e',0,0,75}, - {'h',0,1,90}, - {'i',F,0,108}, - {'t',L,4,177}, - {'o',L,2,135}, - {'n',0,0,131}, - {'s',0,0,156}, - {'v',L,2,210}, - {'u',0,0,201}, - {'w',0,1,211}, - {'y',0,0,237}, - - {'m',L|F,5,0}, - {'f',L,2,12}, - {'b',0,0,7}, - {'g',0,1,13}, - {'l',0,0,17}, - {'r',L,2,19}, - {'n',F,0,16}, - {'s',F,1,0}, - {'t',F,0,0}, - - {'o',0,0,1}, - - {'u',0,1,2}, - {'v',F,0,0}, - - {'t',F,0,0}, - - {'t',0,0,1}, - - {'e',0,0,1}, - - {'r',F,0,0}, - - {'a',0,0,1}, - - {'i',0,0,1}, - - {'n',F,0,1}, - - {'s',0,0,1}, - - {'t',F,0,0}, - - {'l',F,0,0}, - - {'d',F,1,0}, - {'i',F,0,0}, - - {'e',F,0,0}, - - {'o',L,2,21}, - {'e',F,0,3}, - {'u',0,1,21}, - {'y',F,0,0}, - - {'f',L,3,9}, - {'c',0,1,4}, - {'e',0,0,6}, - {'l',0,1,8}, - {'t',0,0,9}, - - {'a',0,0,1}, - - {'u',0,0,1}, - - {'s',F,0,0}, - - {'n',F,0,0}, - - {'o',0,0,1}, - - {'r',F,0,0}, - - {'o',0,0,1}, - - {'w',F,0,0}, - - {'w',0,0,1}, - - {'e',0,0,1}, - - {'e',0,0,1}, - - {'n',F,0,0}, - - {'t',0,0,1}, - - {'h',F,0,0}, - - {'t',F,0,0}, - - {'a',0,1,2}, - {'o',0,0,2}, - - {'n',F,0,0}, - - {'u',0,0,1}, - - {'l',0,0,1}, - - {'d',F,0,0}, - - {'o',L|F,2,4}, - 
{'i',0,0,2}, - {'u',0,0,5}, - - {'d',F,0,0}, - - {'e',F,1,0}, - {'w',0,0,1}, - - {'n',F,0,0}, - - {'r',0,0,1}, - - {'e',F,0,0}, - - {'a',0,0,1}, - - {'c',0,0,1}, - - {'h',F,0,0}, - - {'o',L,2,5}, - {'e',0,0,3}, - {'r',0,1,4}, - {'u',0,0,5}, - - {'w',F,0,0}, - - {'r',F,0,0}, - - {'o',0,0,1}, - - {'m',F,0,0}, - - {'r',0,0,1}, - - {'t',0,0,1}, - - {'h',0,0,1}, - - {'e',0,0,1}, - - {'r',F,0,0}, - - {'e',L|F,2,7}, - {'a',F,0,3}, - {'i',F,1,11}, - {'o',0,0,15}, - - {'d',F,1,0}, - {'v',0,0,1}, - - {'e',F,0,0}, - - {'r',F,0,1}, - - {'e',F,1,0}, - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'f',F,0,0}, - - {'m',F,0,1}, - - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'f',F,0,0}, - - {'w',F,0,0}, - - {'n',L|F,2,4}, - {'f',F,0,0}, - {'s',F,1,0}, - {'t',F,0,3}, - - {'t',0,0,1}, - - {'o',F,0,0}, - - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'f',F,0,0}, - - {'o',L,3,6}, - {'a',0,1,4}, - {'e',F,0,0}, - {'u',0,1,7}, - {'y',F,0,8}, - - {'y',F,0,0}, - - {'r',0,1,2}, - {'s',0,0,2}, - - {'e',F,0,0}, - - {'t',F,0,0}, - - {'s',0,0,1}, - - {'t',F,0,0}, - - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'f',F,0,0}, - - {'o',F,0,1}, - - {'r',F,1,0}, - {'t',F,0,0}, - - {'t',L,4,11}, - {'n',L|F,2,7}, - {'f',F,0,5}, - {'r',F,0,0}, - {'v',L,2,16}, - {'u',0,0,9}, - {'w',0,0,16}, - - {'f',F,0,0}, - - {'c',F,1,0}, - {'l',0,0,1}, - - {'i',F,0,0}, - - {'h',0,0,1}, - - {'e',0,0,1}, - - {'r',F,0,0}, - - {'r',F,1,2}, - {'t',F,0,0}, - - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'v',F,0,0}, - - {'e',0,0,1}, - - {'r',F,0,0}, - - {'n',F,0,0}, - - {'h',L,2,6}, - {'a',0,0,3}, - {'o',F,1,12}, - {'u',0,0,13}, - - {'m',0,0,1}, - - {'e',F,0,0}, - - {'e',L|F,2,0}, - {'a',0,0,2}, - {'o',0,0,3}, - - {'l',0,0,1}, - - {'l',F,0,0}, - - {'u',0,0,1}, - - {'l',0,0,1}, - - {'d',F,0,0}, - - {'m',0,0,1}, - - {'e',F,0,0}, - - {'c',0,0,1}, - - {'h',F,0,0}, - - {'h',0,1,2}, - {'o',F,0,27}, - - {'i',L|F,3,0}, - {'a',0,1,4}, - {'e',F,0,5}, - {'o',0,1,17}, - {'r',0,0,18}, - - {'n',F,1,0}, 
- {'t',F,0,0}, - - {'n',L|F,3,0}, - {'i',0,1,5}, - {'m',F,0,5}, - {'s',L,2,9}, - {'r',0,0,7}, - {'y',F,0,0}, - - {'r',F,0,0}, - - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'v',F,0,0}, - - {'e',F,0,0}, - - {'e',F,0,0}, - - {'s',0,0,1}, - - {'e',F,0,0}, - - {'o',0,0,1}, - - {'u',0,0,1}, - - {'g',0,0,1}, - - {'h',F,0,0}, - - {'o',F,0,0}, - - {'n',0,1,2}, - {'p',F,0,0}, - - {'d',0,1,2}, - {'t',0,0,3}, - - {'e',0,0,1}, - - {'r',F,0,0}, - - {'i',0,0,1}, - - {'l',F,0,0}, - - {'e',0,0,1}, - - {'r',0,0,1}, - - {'i',F,0,0}, - - {'h',L,3,7}, - {'a',F,1,0}, - {'e',F,0,3}, - {'i',0,1,17}, - {'o',0,0,20}, - - {'r',0,0,1}, - - {'e',F,0,0}, - - {'e',L,2,5}, - {'a',0,0,3}, - {'i',F,1,6}, - {'o',F,0,9}, - - {'t',F,0,0}, - - {'n',F,1,0}, - {'r',0,0,1}, - - {'e',F,0,0}, - - {'c',0,1,2}, - {'l',0,0,2}, - - {'h',F,0,0}, - - {'e',F,0,0}, - - {'m',F,0,0}, - - {'l',0,1,2}, - {'t',0,0,2}, - - {'l',F,0,0}, - - {'h',F,0,0}, - - {'u',0,0,1}, - - {'l',0,0,1}, - - {'d',F,0,0}, - - {'o',0,0,1}, - - {'u',F,0,1}, - - {'r',F,0,1}, - - {'s',0,0,1}, - - {'e',0,0,1}, - - {'l',0,0,1}, - - {'f',F,1,0}, - {'v',F,0,0} -}; - -static unsigned int -find_english_stopword( unsigned char *buf, int len ) { - ESWNODE *ptr = engstoptree; - int result = 0; - unsigned char *cur = buf; - - while( cur - buf < len ) { - if ( ptr->val == *cur ) { - cur++; - if ( ISFINISH(ptr) ) result = cur - buf; - if ( ! ptr->child ) break; - ptr += ptr->child; - } else if ( ptr->val > *cur ) { - if ( ISLEFT(ptr) ) - ptr++; - else - break; - } else { - if ( ptr->right ) - ptr += ptr->right; - else - break; - } - } - return result; -} - -#undef L -#undef F -#undef ISLEFT -#undef ISFINISH - -static int -is_stopengword(void* obj,char* word,int len) -{ - return ( len == find_english_stopword((unsigned char*)word, len) ) ? 
1 : 0; -} - -static void * -setup_english_stemmer(void) -{ - struct english_stemmer * z = (struct english_stemmer *) malloc(sizeof(struct english_stemmer)); - z->p = 0; z->p_size = 0; - z->irregulars = create_pool(irregular_forms); - return (void *) z; -} - -static void -closedown_english_stemmer(void * z_) -{ - struct english_stemmer * z = (struct english_stemmer *) z_; - free_pool(z->irregulars); - free(z->p); - free(z); -} - -static char* -engstemming(void* obj, char *word, int *len) -{ - struct english_stemmer * z = (struct english_stemmer *) obj; - const char* stemmed_word; - char *result = word; - - while(result-word < *len) { - *result = tolower((unsigned char) *result); - result++; - } - stemmed_word = english_stem(obj, word, 0, *len-1); - *len = z->k + 1; - - result = (char*)palloc( *len ); - memcpy((void*)result, (void*)stemmed_word, *len); - return result; -} -#endif /* DICT_BODY */ - -#ifdef DICT_TABLE -TABLE_DICT_START - "C", - setup_english_stemmer, - closedown_english_stemmer, - engstemming, - NULL, - is_stopengword -TABLE_DICT_END -#endif - |