en_stemmer.js 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  // Porter stemmer in JavaScript. Few comments, but it's easy to follow against the rules in the original
  // paper, in
  //
  // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  // no. 3, pp 130-137,
  //
  // see also http://www.tartarus.org/~martin/PorterStemmer
  // Release 1 by 'andargor', Jul 2004
  // Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
  define(function () {
    // AMD module: the value returned by this factory (the stemmer function) is
    // what the module exports.
    return (function () {
      // Step 2 suffix -> replacement; applied only when the stem before the
      // suffix has measure m > 0.
      var step2list = {
        "ational": "ate",
        "tional": "tion",
        "enci": "ence",
        "anci": "ance",
        "izer": "ize",
        "bli": "ble",
        "alli": "al",
        "entli": "ent",
        "eli": "e",
        "ousli": "ous",
        "ization": "ize",
        "ation": "ate",
        "ator": "ate",
        "alism": "al",
        "iveness": "ive",
        "fulness": "ful",
        "ousness": "ous",
        "aliti": "al",
        "iviti": "ive",
        "biliti": "ble",
        "logi": "log"
      };

      // Step 3 suffix -> replacement; same m > 0 precondition as step 2.
      var step3list = {
        "icate": "ic",
        "ative": "",
        "alize": "al",
        "iciti": "ic",
        "ical": "ic",
        "ful": "",
        "ness": ""
      };

      // Regex source fragments used to express Porter's "measure" m of a stem.
      var c = "[^aeiou]"; // consonant
      var v = "[aeiouy]"; // vowel
      var C = c + "[^aeiouy]*"; // consonant sequence
      var V = v + "[aeiou]*"; // vowel sequence
      var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
      var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
      var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
      var s_v = "^(" + C + ")?" + v; // vowel in stem

      // Every pattern is compiled once here instead of on each call. None of
      // them carries the g or y flag, so test()/exec() keep no state between
      // calls and the shared instances are safe to reuse.
      var reMeasureGt0 = new RegExp(mgr0);
      var reMeasureEq1 = new RegExp(meq1);
      var reMeasureGt1 = new RegExp(mgr1);
      var reHasVowel = new RegExp(s_v);
      var reDoubleConsonant = new RegExp("([^aeiouylsz])\\1$");
      var reShortCvc = new RegExp("^" + C + v + "[^aeiouwxy]$");
      var reLastChar = /.$/;

      var reStep1aEs = /^(.+?)(ss|i)es$/;
      var reStep1aS = /^(.+?)([^s])s$/;
      var reStep1bEed = /^(.+?)eed$/;
      var reStep1bEdIng = /^(.+?)(ed|ing)$/;
      var reStep1bAddE = /(at|bl|iz)$/;
      var reStep1cY = /^(.+?)y$/;
      var reStep2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
      var reStep3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
      var reStep4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
      var reStep4Ion = /^(.+?)(s|t)(ion)$/;
      var reStep5E = /^(.+?)e$/;
      var reStep5Ll = /ll$/;

      /**
       * Reduce an English word to its Porter stem.
       *
       * The character classes above contain only lower-case letters, so the
       * rules are written for lower-case input; no case folding is done here.
       *
       * @param {string} w - The word to stem.
       * @returns {string} The stem; words shorter than three characters are
       *     returned unchanged.
       */
      return function (w) {
        var firstch;
        var match;
        var stem;

        if (w.length < 3) {
          return w;
        }

        // Upper-case a leading "y" so that no vowel class can match it while
        // the rules run; it is restored just before returning.
        firstch = w.charAt(0);
        if (firstch === "y") {
          w = firstch.toUpperCase() + w.slice(1);
        }

        // Step 1a
        if (reStep1aEs.test(w)) {
          w = w.replace(reStep1aEs, "$1$2");
        } else if (reStep1aS.test(w)) {
          w = w.replace(reStep1aS, "$1$2");
        }

        // Step 1b
        match = reStep1bEed.exec(w);
        if (match) {
          // An "eed" word never falls through to the ed/ing rule, even when
          // the measure test below fails.
          if (reMeasureGt0.test(match[1])) {
            w = w.replace(reLastChar, "");
          }
        } else {
          match = reStep1bEdIng.exec(w);
          if (match) {
            stem = match[1];
            if (reHasVowel.test(stem)) {
              w = stem;
              if (reStep1bAddE.test(w)) {
                w = w + "e";
              } else if (reDoubleConsonant.test(w)) {
                w = w.replace(reLastChar, "");
              } else if (reShortCvc.test(w)) {
                w = w + "e";
              }
            }
          }
        }

        // Step 1c
        match = reStep1cY.exec(w);
        if (match) {
          stem = match[1];
          if (reHasVowel.test(stem)) {
            w = stem + "i";
          }
        }

        // Step 2
        match = reStep2.exec(w);
        if (match) {
          stem = match[1];
          if (reMeasureGt0.test(stem)) {
            w = stem + step2list[match[2]];
          }
        }

        // Step 3
        match = reStep3.exec(w);
        if (match) {
          stem = match[1];
          if (reMeasureGt0.test(stem)) {
            w = stem + step3list[match[2]];
          }
        }

        // Step 4
        match = reStep4.exec(w);
        if (match) {
          stem = match[1];
          if (reMeasureGt1.test(stem)) {
            w = stem;
          }
        } else {
          // "ion" is only removed after "s" or "t", and that letter stays in
          // the stem.
          match = reStep4Ion.exec(w);
          if (match) {
            stem = match[1] + match[2];
            if (reMeasureGt1.test(stem)) {
              w = stem;
            }
          }
        }

        // Step 5
        match = reStep5E.exec(w);
        if (match) {
          stem = match[1];
          if (reMeasureGt1.test(stem) ||
              (reMeasureEq1.test(stem) && !reShortCvc.test(stem))) {
            w = stem;
          }
        }
        if (reStep5Ll.test(w) && reMeasureGt1.test(w)) {
          w = w.replace(reLastChar, "");
        }

        // and turn initial Y back to y
        if (firstch === "y") {
          w = firstch.toLowerCase() + w.slice(1);
        }

        return w;
      };
    })();
  });