The software is completely free for any purpose, unless notes at the head of the program text indicates otherwise (which is rare). In any case, the notes about licensing are never more restrictive than the BSD License.
In every case where the software is not written by me (Martin Porter), this licensing arrangement has been endorsed by the contributor, and it is therefore unnecessary to ask the contributor again to confirm it.
I have not asked any contributors (or their employers, if they have them) for proofs that they have the right to distribute their software in this way.
History:
Release 1
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] is then out outside the bounds of b.
Release 2
Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and b[j] is then outside the bounds of b.
Release 3
Considerably revised 4/9/00 in the light of many helpful suggestions from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
/** * Stemmer, implementing the Porter Stemming Algorithm * * The Stemmer class transforms a word into its root form. The input * word can be provided a character at time (by calling add()), or at once * by calling one of the various stem(something) methods.
*/
publicclass Stemmer
{ privatechar[] b; privateint i, /* offset into b */
i_end, /* offset to end of stemmed word */
j, k; privatestaticfinalint INC = 50; /* unit of size whereby b is increased */ public Stemmer()
{ b = newchar[INC];
i = 0;
i_end = 0;
}
/** * Add a character to the word being stemmed. When you are finished * adding characters, you can call stem(void) to stem the word.
*/
publicvoid add(char ch)
{ if (i == b.length)
{ char[] new_b = newchar[i+INC]; for (int c = 0; c < i; c++) new_b[c] = b[c];
b = new_b;
}
b[i++] = ch;
}
/** Adds wLen characters to the word being stemmed contained in a portion * of a char[] array. This is like repeated calls of add(char ch), but * faster.
*/
publicvoid add(char[] w, int wLen)
{ if (i+wLen >= b.length)
{ char[] new_b = newchar[i+wLen+INC]; for (int c = 0; c < i; c++) new_b[c] = b[c];
b = new_b;
} for (int c = 0; c < wLen; c++) b[i++] = w[c];
}
/** * After a word has been stemmed, it can be retrieved by toString(), * or a reference to the internal buffer can be retrieved by getResultBuffer * and getResultLength (which is generally more efficient.)
*/ public String toString() { returnnew String(b,0,i_end); }
/** * Returns the length of the word resulting from the stemming process.
*/ publicint getResultLength() { return i_end; }
/** * Returns a reference to a character buffer containing the results of * the stemming process. You also need to consult getResultLength() * to determine the length of the result.
*/ publicchar[] getResultBuffer() { return b; }
/* m() measures the number of consonant sequences between 0 and j. if c is a consonant sequence and v a vowel sequence, and <..> indicates arbitrary presence,
privatefinalint m()
{ int n = 0; int i = 0; while(true)
{ if (i > j) return n; if (! cons(i)) break; i++;
}
i++; while(true)
{ while(true)
{ if (i > j) return n; if (cons(i)) break;
i++;
}
i++;
n++; while(true)
{ if (i > j) return n; if (! cons(i)) break;
i++;
}
i++;
}
}
/* vowelinstem() is true <=> 0,...j contains a vowel */
privatefinalboolean vowelinstem()
{ int i; for (i = 0; i <= j; i++) if (! cons(i)) returntrue; returnfalse;
}
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
privatefinalboolean doublec(int j)
{ if (j < 1) returnfalse; if (b[j] != b[j-1]) returnfalse; return cons(j);
}
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant and also if the second c is not w,x or y. this is used when trying to restore an e at the end of a short word. e.g.
cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
*/
privatefinalboolean cvc(int i)
{ if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) returnfalse;
{ int ch = b[i]; if (ch == 'w' || ch == 'x' || ch == 'y') returnfalse;
} returntrue;
}
privatefinalboolean ends(String s)
{ int l = s.length(); int o = k-l+1; if (o < 0) returnfalse; for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) returnfalse;
j = k-l; returntrue;
}
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
k. */
privatefinalvoid setto(String s)
{ int l = s.length(); int o = j+1; for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
k = j+l;
}
/* r(s) is used further down. */
privatefinalvoid r(String s) { if (m() > 0) setto(s); }
/* step1() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
/* step3() maps double suffices to single ones. so -ization ( = -ize plus -ation) maps to -ize etc. note that the string before the suffix must give
m() > 0. */
privatefinalvoid step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1])
{ case'a': if (ends("ational")) { r("ate"); break; } if (ends("tional")) { r("tion"); break; } break; case'c': if (ends("enci")) { r("ence"); break; } if (ends("anci")) { r("ance"); break; } break; case'e': if (ends("izer")) { r("ize"); break; } break; case'l': if (ends("bli")) { r("ble"); break; } if (ends("alli")) { r("al"); break; } if (ends("entli")) { r("ent"); break; } if (ends("eli")) { r("e"); break; } if (ends("ousli")) { r("ous"); break; } break; case'o': if (ends("ization")) { r("ize"); break; } if (ends("ation")) { r("ate"); break; } if (ends("ator")) { r("ate"); break; } break; case's': if (ends("alism")) { r("al"); break; } if (ends("iveness")) { r("ive"); break; } if (ends("fulness")) { r("ful"); break; } if (ends("ousness")) { r("ous"); break; } break; case't': if (ends("aliti")) { r("al"); break; } if (ends("iviti")) { r("ive"); break; } if (ends("biliti")) { r("ble"); break; } break; case'g': if (ends("logi")) { r("log"); break; }
} }
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
privatefinalvoid step4() { switch (b[k])
{ case'e': if (ends("icate")) { r("ic"); break; } if (ends("ative")) { r(""); break; } if (ends("alize")) { r("al"); break; } break; case'i': if (ends("iciti")) { r("ic"); break; } break; case'l': if (ends("ical")) { r("ic"); break; } if (ends("ful")) { r(""); break; } break; case's': if (ends("ness")) { r(""); break; } break;
} }
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
privatefinalvoid step5()
{ if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
{ case'a': if (ends("al")) break; return; case'c': if (ends("ance")) break; if (ends("ence")) break; return; case'e': if (ends("er")) break; return; case'i': if (ends("ic")) break; return; case'l': if (ends("able")) break; if (ends("ible")) break; return; case'n': if (ends("ant")) break; if (ends("ement")) break; if (ends("ment")) break; /* element etc. not stripped before the m */ if (ends("ent")) break; return; case'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; /* j >= 0 fixes Bug 2 */ if (ends("ou")) break; return; /* takes care of -ous */ case's': if (ends("ism")) break; return; case't': if (ends("ate")) break; if (ends("iti")) break; return; case'u': if (ends("ous")) break; return; case'v': if (ends("ive")) break; return; case'z': if (ends("ize")) break; return; default: return;
} if (m() > 1) k = j;
}
/* step6() removes a final -e if m() > 1. */
privatefinalvoid step6()
{ j = k; if (b[k] == 'e')
{ int a = m(); if (a > 1 || a == 1 && !cvc(k-1)) k--;
} if (b[k] == 'l' && doublec(k) && m() > 1) k--;
}
/** Stem the word placed into the Stemmer buffer through calls to add(). * Returns true if the stemming process resulted in a word different * from the input. You can retrieve the result with * getResultLength()/getResultBuffer() or toString().
*/ publicvoid stem()
{ k = i - 1; if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
i_end = k+1; i = 0;
}
/** Test program for demonstrating the Stemmer. It reads text from a * a list of files, stems each word, and writes the result to standard * output. Note that the word stemmed is expected to be in lower case: * forcing lower case must be done outside the Stemmer class. * Usage: Stemmer file-name file-name ...
*/ publicstaticvoid main(String[] args)
{ char[] w = newchar[501];
Stemmer s = new Stemmer(); for (int i = 0; i < args.length; i++) try
{
FileInputStream in = new FileInputStream(args[i]);
try
{ while(true)
{ int ch = in.read(); if (Character.isLetter((char) ch))
{ int j = 0; while(true)
{ ch = Character.toLowerCase((char) ch);
w[j] = (char) ch; if (j < 500) j++;
ch = in.read(); if (!Character.isLetter((char) ch))
{ /* to test add(char ch) */ for (int c = 0; c < j; c++) s.add(w[c]);
/* or, to test add(char[] w, int j) */ /* s.add(w, j); */
s.stem();
{ String u;
/* and now, to test toString() : */
u = s.toString();
/* to test getResultBuffer(), getResultLength() : */ /* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.