using System; using System.Text; namespace SearchEngine { /// /// Summary description for Stemming. /// public interface IStemming { /// /// Returns the stemmed form of a word /// /// The word to stem. It must be capitalized /// The stemmed word string StemWord(string word); } /// /// Performs stemming for English words using Porter's Algorithm. /// public class EnglishStemming: IStemming { private int i, i_end, j, k; private char []b; public EnglishStemming() { b=new char[0]; i=0; i_end=0; } public string StemWord(string word) { i=word.Length; b=word.ToCharArray(); i_end=0; k = i - 1; if (k > 1) { Step1(); Step2(); Step3(); Step4(); Step5(); Step6(); } i_end = k+1; i = 0; return new string(b, 0, i_end); } private bool IsConsonant(int i) { switch(b[i]) { case 'A': case 'E': case 'I': case 'O': case 'U': return false; case 'Y': return (i==0)?true:IsConsonant(i-1); default: return true; } } private int M() { int n = 0, i = 0; while(true) { if (i > j) return n; if (! IsConsonant(i)) break; i++; } i++; while(true) { while(true) { if (i > j) return n; if (IsConsonant(i)) break; i++; } i++; n++; while(true) { if (i > j) return n; if (! IsConsonant(i)) break; i++; } i++; } } private bool ContainsVowel() { for(int pos=0; pos<=j; pos++) { if(!IsConsonant(pos)) { return true; } } return false; } private bool DoubleConsonant(int pos) { if(pos<1) { return false; } if(b[pos]!=b[pos-1]) { return false; } return IsConsonant(pos); } private bool CVC(int pos) { if (pos < 2 || !IsConsonant(pos) || IsConsonant(pos-1) || !IsConsonant(pos-2)) { return false; } int ch = b[i]; if (ch == 'W' || ch == 'X' || ch == 'Y') { return false; } return true; } private bool EndsWith(string s) { int l = s.Length; int o = k-l+1; if (o < 0) return false; for (int i = 0; i < l; i++) if (b[o+i] != s[i]) return false; j = k-l; return true; } private void SetTo(string s) { int l = s.Length; int o = j+1; for (int i = 0; i < l; i++) b[o+i] = s[i]; k = j+l; } private void R(string s) { if(M()>0) { SetTo(s); } } private void Step1() { if (b[k] == 'S') { if (EndsWith("SSES")) k -= 2; else if (EndsWith("IES")) SetTo("I"); else if (b[k-1] != 'S') k--; } if (EndsWith("EED")) { if (M() > 0) k--; } else if ((EndsWith("ED") || EndsWith("ING")) && ContainsVowel()) { k = j; if (EndsWith("AT")) SetTo("ATE"); else if (EndsWith("BL")) SetTo("BLE"); else if (EndsWith("IZ")) SetTo("IZE"); else if (DoubleConsonant(k)) { k--; int ch = b[k]; if (ch == 'L' || ch == 'S' || ch == 'Z') k++; } else if (M() == 1 && CVC(k)) SetTo("E"); } } private void Step2() { if (EndsWith("Y") && ContainsVowel()) b[k] = 'I'; } private void Step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) { case 'A': if (EndsWith("ATIONAL")) { R("ATE"); break; } if (EndsWith("TIONAL")) { R("TION"); break; } break; case 'C': if (EndsWith("ENCI")) { R("ENCE"); break; } if (EndsWith("ANCI")) { R("ANCE"); break; } break; case 'E': if (EndsWith("IZER")) { R("IZE"); break; } break; case 'L': if (EndsWith("BLI")) { R("BLE"); break; } if (EndsWith("ALLI")) { R("AL"); break; } if (EndsWith("ENTLI")) { R("ENT"); break; } if (EndsWith("ELI")) { R("E"); break; } if (EndsWith("OUSLI")) { R("OUS"); break; } break; case 'O': if (EndsWith("IZATION")) { R("IZE"); break; } if (EndsWith("ATION")) { R("ATE"); break; } if (EndsWith("ATOR")) { R("ATE"); break; } break; case 'S': if (EndsWith("ALISM")) { R("AL"); break; } if (EndsWith("IVENESS")) { R("IVE"); break; } if (EndsWith("FULNESS")) { R("FUL"); break; } if (EndsWith("OUSNESS")) { R("OUS"); break; } break; case 'T': if (EndsWith("ALITI")) { R("AL"); break; } if (EndsWith("IVITI")) { R("IVE"); break; } if (EndsWith("BILITI")) { R("BLE"); break; } break; case 'G': if (EndsWith("LOGI")) { R("LOG"); break; } break; default : break; } } private void Step4() { switch (b[k]) { case 'E': if (EndsWith("ICATE")) { R("IC"); break; } if (EndsWith("ATIVE")) { R(""); break; } if (EndsWith("ALIZE")) { R("AL"); break; } break; case 'I': if (EndsWith("ICITI")) { R("IC"); break; } break; case 'L': if (EndsWith("ICAL")) { R("IC"); break; } if (EndsWith("FUL")) { R(""); break; } break; case 'S': if (EndsWith("NESS")) { R(""); break; } break; } } private void Step5() { if (k == 0) return; /* for Bug 1 */ switch ( b[k-1] ) { case 'A': if (EndsWith("AL")) break; return; case 'C': if (EndsWith("ANCE")) break; if (EndsWith("ENCE")) break; return; case 'E': if (EndsWith("ER")) break; return; case 'I': if (EndsWith("IC")) break; return; case 'L': if (EndsWith("ABLE")) break; if (EndsWith("IBLE")) break; return; case 'N': if (EndsWith("ANT")) break; if (EndsWith("EMENT")) break; if (EndsWith("MENT")) break; if (EndsWith("ENT")) break; return; case 'O': if (EndsWith("ION") && j >= 0 && (b[j] == 'S' || b[j] == 'T')) break; if (EndsWith("OU")) break; return; case 'S': if (EndsWith("ISM")) break; return; case 'T': if (EndsWith("ATE")) break; if (EndsWith("ITI")) break; return; case 'U': if (EndsWith("OUS")) break; return; case 'V': if (EndsWith("IVE")) break; return; case 'Z': if (EndsWith("IZE")) break; return; default: return; } if (M() > 1) k = j; } private void Step6() { j = k; if (b[k] == 'E') { int a = M(); if (a > 1 || a == 1 && !CVC(k-1)) k--; } if (b[k] == 'L' && DoubleConsonant(k) && M() > 1) k--; } } /// /// Implements a stemming algorithm for greek words /// public class GreekStemming: IStemming { public string StemWord(string word) { int l; l=word.Length; if (l<=3) { return word; } char l1,l2,l3,l4,l5; StringBuilder sbResult = new StringBuilder(word); l1=sbResult[l-1]; l2=sbResult[l-2]; l3=sbResult[l-3]; /* ----------------------- level 1 ------------------------- */ if (l>3) { if (((l1=='Ó') && (l2=='Ï')) || ((l1=='Ó') && (l2=='Ç')) || ((l1=='Ó') && (l2=='Å')) || ((l1=='Í') && (l2=='Ù')) || ((l1=='Õ') && (l2=='Ï')) || ((l1=='É') && (l2=='Ï')) || ((l1=='Ó') && (l2=='Á')) || ((l1=='Ó') && (l2=='Ù')) || ((l1=='É') && (l2=='Á')) || ((l1=='É') && (l2=='Å'))) sbResult[l-2]='\0'; else if ((l1=='Á') || (l1=='Ç') || (l1=='Ï') || (l1=='Å') || (l1=='Ù') || (l1=='É')) sbResult[l-1]='\0'; else if (((l1=='Ó') && (l2=='Õ') && (l3=='Ï')) || ((l1=='Ó') && (l2=='É') && (l3=='Å')) || ((l1=='Í') && (l2=='Õ') && (l3=='Ï'))) sbResult[l-3]='\0'; } /* --------------------- level 2 ------------------------- */ l=sbResult.Length; if (l > 7) { l1=sbResult[l-1]; l2=sbResult[l-2]; l3=sbResult[l-3]; l4=sbResult[l-4]; l5=sbResult[l-5]; if ((l5=='Ï') && (l4=='Õ') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) { sbResult[l-5]='\0'; l-=5; } if ((l5=='Å') && (l4=='Ó') && (l3=='Ô') && (l2=='Å') && (l1=='Ñ')) { sbResult[l-5]='\0'; l-=5; } if ((l5=='É') && (l4=='Ó') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) { sbResult[l-4]='Æ'; sbResult[l-3]='\0'; l-=3; } if ((l5=='Á') && (l4=='Ó') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) { sbResult[l-4]='Æ'; sbResult[l-3]='\0'; l-=3; } if ((l5=='Ç') && (l4=='Ó') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) { sbResult[l-5]='\0'; l-=5; } if ((l5=='É') && (l4=='Ó') && (l3=='Ô') && (l2=='É') && (l1=='Ê')) { sbResult[l-5]='\0'; l-=5; } } if (l > 6) { l1=sbResult[l-1]; l2=sbResult[l-2]; l3=sbResult[l-3]; l4=sbResult[l-4]; if (((l1=='Ô') && (l2=='Ç') && (l3=='Ô') && (l4=='Ï')) || ((l1=='Ä') && (l2=='É') && (l3=='Ô') && (l4=='É')) || ((l1=='Ê') && (l2=='É') && (l3=='Ô') && (l4=='Á')) || ((l1=='Ñ') && (l2=='Å') && (l3=='Ô') && (l4=='Ï')) || ((l1=='Ñ') && (l2=='Å') && (l3=='Ô') && (l4=='Õ')) || ((l1=='Ô') && (l2=='Í') && (l3=='Õ') && (l4=='Ï')) || ((l1=='Ô') && (l2=='Å') && (l3=='Í') && (l4=='É')) || ((l1=='Ô') && (l2=='Å') && (l3=='Ó') && (l4=='Å')) || ((l1=='Ô') && (l2=='Å') && (l3=='Ó') && (l4=='Ç'))) { sbResult[l-4]='\0'; l-=4; } if (((l4=='Ï') && (l3=='Ô') && (l2=='É') && (l1=='Ê')) || ((l4=='Ç') && (l3=='Ô') && (l2=='É') && (l1=='Ê')) || ((l4=='Á') && (l3=='Ô') && (l2=='É') && (l1=='Í'))) { sbResult[l-4]='\0'; l-=4; } if (((l4=='É') && (l3=='Æ') && (l2=='Å') && (l1=='Ô')) || ((l4=='Ç') && (l3=='Ì') && (l2=='Á') && (l1=='Ô'))) { sbResult[l-2]='\0'; l-=2; } if (((l4=='Á') && (l3=='Æ') && (l2=='Å') && (l1=='Ô')) || ((l4=='Á') && (l3=='Ó') && (l2=='Å') && (l1=='Ô'))) { sbResult[l-4]='\0'; l-=4; } if ((l4=='É') && (l3=='Á') && (l2=='Ó') && (l1=='Ì')) { sbResult[l-4]='\0'; l-=4; } if ((l1=='Ô') && (l2=='Å') && (l3=='Ó') && (l4=='É')) { sbResult[l-4]='É'; sbResult[l-3]='Æ'; sbResult[l-2]='\0'; l-=2; } if ((l4=='Ù') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) if ((l-4)>3) { sbResult[l-4]='\0'; l-=4; } else { sbResult[l-3]='\0'; l-=3; } if ((l4=='Ç') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) { sbResult[l-4]='\0'; l-=4; } if ((l4=='Ï') && (l3=='Ì') && (l2=='Å') && (l1=='Í')) { sbResult[l-4]='\0'; l-=4; } if ((l4=='Á') && (l3=='Ì') && (l2=='Å') && (l1=='Í') && (l-4 > 3)) { sbResult[l-4]='\0'; l-=4; } } if (l > 5) { l1=sbResult[l-1]; l2=sbResult[l-2]; l3=sbResult[l-3]; if (((l1=='Ê') && (l2=='Á') && (l3=='É')) || ((l1=='Ä') && (l2=='Ù') && (l3=='É')) || ((l1=='Ô') && (l2=='Í') && (l3=='Ï'))) { sbResult[l-3]='\0'; l-=3; } if ((l3=='Ï') && (l2=='Õ') && (l1=='Ì') && (l-3 > 3)) { sbResult[l-3]='\0'; l-=3; } if ((l3=='Å') && (l2=='É') && (l1=='Ô')) { sbResult[l-3]='\0'; l-=3; } if ((l3=='É') && (l2=='Ó') && (l1=='Ì')) { sbResult[l-2]='Æ'; sbResult[l-1]='\0'; l-=1; } if ((l3=='Á') && (l2=='Ó') && (l1=='Ì')) { sbResult[l-2]='Æ'; sbResult[l-1]='\0'; l-=1; } if ((l3=='Ù') && (l2=='Í') && (l1=='Ô')) { sbResult[l-3]='\0'; l-=3; } if ((l3=='É') && (l2=='Á') && (l1=='Ó')) { sbResult[l-3]='\0'; l-=3; } if (((l3=='Á') && (l2=='Ó') && (l1=='Ô')) || ((l3=='Á') && (l2=='Ó') && (l1=='È'))) { sbResult[l-3]='\0'; l-=3; } if (((l1=='È') && (l2=='Ó') && (l3=='É')) || ((l1=='Ô') && (l2=='Ó') && (l3=='É'))) { sbResult[l-3]='É'; sbResult[l-2]='Æ'; sbResult[l-1]='\0'; l-=1; } if (((l3=='Å') && (l2=='Ó')) && ((l1=='Ô') || (l1=='È'))) { sbResult[l-3]='\0'; l-=3; } if ((((l3=='Ù') && (l2=='Ó')) && ((l1=='Ô') || (l1=='È'))) && (l-3>3)) { sbResult[l-3]='\0'; l-=3; } if ((l3=='É') && (l2=='Ä') && (l1=='É')) { l-=3; if (l > 3) sbResult[l-3]='\0'; else l+=3; } } if (l > 4) { l1=sbResult[l-1]; l2=sbResult[l-2]; if ((l2=='Á') && (l1=='Ô') && (l-2 > 3)) { sbResult[l-2]='\0'; l-=2; } if ((l2=='É') && (l1=='Ä')) { sbResult[l-2]='\0'; l-=2; } if ((l2=='Ç') && (l1=='Ó')) { sbResult[l-2]='\0'; l-=2; } if ((l2=='Ç') && (l1=='È')) { sbResult[l-2]='\0'; l-=2; } if ((l1=='Ó') && (l2=='É') && (l-2 > 3)) sbResult[l-1]='Æ'; if ((l2=='Å') && (l1=='Ô')) { sbResult[l-2]='\0'; l-=2; } if ((l2=='É') && (l1=='Ê') && (l-2 > 3)) { sbResult[l-2]='\0'; l-=2; } if ((l2=='Å') && (l1=='É')) { sbResult[l-2]='\0'; l-=2; } if ((l2=='É') && (l1=='Í') && (l-2 > 3)) { sbResult[l-2]='\0'; l-=2; } if ((l2=='Ù') && ((l1=='Ó') || (l1=='È'))) sbResult[l-1]='Í'; if ((l2=='Á') && (l1=='È') && (l-2 > 3)) { sbResult[l-2]='\0'; l-=2; } if ((l2=='Á') && (l1=='Ó') && (l-2 > 3)) { sbResult[l-2]='\0'; l-=2; } } if (l > 3) { l1=sbResult[l-1]; if ((l1=='É') || (l1=='Å')) { sbResult[l-1]='\0'; l-=1; } } int intIndexZero=0; string strResult = sbResult.ToString(); intIndexZero = strResult.IndexOf("\0"); if (intIndexZero>0) { return strResult.Substring(0,intIndexZero); } else { return strResult; } } } /// /// Performs stemming for English and Greek words. It's a Singleton class. /// You can use this code freely as long as you maintain this reference. /// The English Stemmer has been adapted from the PHP Implementation by /// Jon Abernathy (http://www.chuggnutt.com) /// Author: Kostas Stroggylos, kostas@circular.gr. /// public class Stemming : IStemming { private static Stemming instance; private EnglishStemming en; private GreekStemming gr; private Stemming() { en=new EnglishStemming(); gr=new GreekStemming(); } public static Stemming Instance() { if(instance==null) { instance=new Stemming(); } return instance; } public string StemWord(string word) { return gr.StemWord(en.StemWord(word)); } } }