fr_stemmer.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. /*
  2. * Author: Kasun Gajasinghe
  3. * E-Mail: kasunbg AT gmail DOT com
  4. * Date: 09.08.2010
  5. *
  6. * usage: stemmer(word);
  7. * ex: var stem = stemmer('foobar');
  8. * Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
  9. *
  10. * LICENSE:
  11. *
  12. * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
  13. *
  14. * Redistribution and use in source and binary forms, with or without modification,
  15. * are permitted provided that the following conditions are met:
  16. *
  17. * 1. Redistributions of source code must retain the above copyright notice,
  18. * this list of conditions and the following disclaimer.
  19. *
  20. * 2. Redistributions in binary form must reproduce the above copyright notice,
  21. * this list of conditions and the following disclaimer in the documentation
  22. * and/or other materials provided with the distribution.
  23. *
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
  26. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  27. * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
  28. * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  29. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  30. * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  31. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  32. * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33. *
  34. */
  35. define(function () {
  36. return function (word) {
  37. // Letters in French include the following accented forms,
  38. // â à ç ë é ê è ï î ô û ù
  39. // The following letters are vowels:
  40. // a e i o u y â à ë é ê è ï î ô û ù
  41. word = word.toLowerCase();
  42. var oriWord = word;
  43. word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel
  44. word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
  45. word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
  46. word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
  47. word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
  48. var rv = '';
  49. var rvIndex = -1;
  50. if (word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1) {
  51. rv = word.substring(3);
  52. rvIndex = 3;
  53. } else {
  54. rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
  55. if (rvIndex != -1) {
  56. rvIndex += 2; //+2 is to supplement the substring(1) used to find rvIndex
  57. rv = word.substring(rvIndex);
  58. } else {
  59. rvIndex = word.length;
  60. }
  61. }
  62. // R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
  63. // R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
  64. var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
  65. var r1 = '';
  66. if (r1Index != -1) {
  67. r1Index += 2;
  68. r1 = word.substring(r1Index);
  69. } else {
  70. r1Index = word.length;
  71. }
  72. var r2Index = -1;
  73. var r2 = '';
  74. if (r1Index != -1) {
  75. r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
  76. if (r2Index != -1) {
  77. r2Index += 2;
  78. r2 = r1.substring(r2Index);
  79. r2Index += r1Index;
  80. } else {
  81. r2 = '';
  82. r2Index = word.length;
  83. }
  84. }
  85. if (r1Index != -1 && r1Index < 3) {
  86. r1Index = 3;
  87. r1 = word.substring(r1Index);
  88. }
  89. /*
  90. Step 1: Standard suffix removal
  91. */
  92. var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/);
  93. var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/);
  94. var a3Index = word.search(/(logie|logies)$/);
  95. var a4Index = word.search(/(usion|ution|usions|utions)$/);
  96. var a5Index = word.search(/(ence|ences)$/);
  97. var a6Index = word.search(/(ement|ements)$/);
  98. var a7Index = word.search(/(ité|ités)$/);
  99. var a8Index = word.search(/(if|ive|ifs|ives)$/);
  100. var a9Index = word.search(/(eaux)$/);
  101. var a10Index = word.search(/(aux)$/);
  102. var a11Index = word.search(/(euse|euses)$/);
  103. var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/);
  104. var a13Index = word.search(/(amment)$/);
  105. var a14Index = word.search(/(emment)$/);
  106. var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/);
  107. if (a1Index != -1 && a1Index >= r2Index) {
  108. word = word.substring(0, a1Index);
  109. } else if (a2Index != -1 && a2Index >= r2Index) {
  110. word = word.substring(0, a2Index);
  111. var a2Index2 = word.search(/(ic)$/);
  112. if (a2Index2 != -1 && a2Index2 >= r2Index) {
  113. word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2,
  114. } else { //else replace by iqU
  115. word = word.replace(/(ic)$/, 'iqU');
  116. }
  117. } else if (a3Index != -1 && a3Index >= r2Index) {
  118. word = word.replace(/(logie|logies)$/, 'log'); //replace with log if in R2
  119. } else if (a4Index != -1 && a4Index >= r2Index) {
  120. word = word.replace(/(usion|ution|usions|utions)$/, 'u'); //replace with u if in R2
  121. } else if (a5Index != -1 && a5Index >= r2Index) {
  122. word = word.replace(/(ence|ences)$/, 'ent'); //replace with ent if in R2
  123. } else if (a6Index != -1 && a6Index >= rvIndex) {
  124. word = word.substring(0, a6Index);
  125. if (word.search(/(iv)$/) >= r2Index) {
  126. word = word.replace(/(iv)$/, '');
  127. if (word.search(/(at)$/) >= r2Index) {
  128. word = word.replace(/(at)$/, '');
  129. }
  130. } else if (word.search(/(eus)$/) != -1) {
  131. var a6Index2 = word.search(/(eus)$/);
  132. if (a6Index2 >= r2Index) {
  133. word = word.substring(0, a6Index2);
  134. } else if (a6Index2 >= r1Index) {
  135. word = word.substring(0, a6Index2) + "eux";
  136. }
  137. } else if (word.search(/(abl|iqU)$/) >= r2Index) {
  138. word = word.replace(/(abl|iqU)$/, ''); //if preceded by abl or iqU, delete if in R2,
  139. } else if (word.search(/(ièr|Ièr)$/) >= rvIndex) {
  140. word = word.replace(/(ièr|Ièr)$/, 'i'); //if preceded by abl or iqU, delete if in R2,
  141. }
  142. } else if (a7Index != -1 && a7Index >= r2Index) {
  143. word = word.substring(0, a7Index); //delete if in R2
  144. if (word.search(/(abil)$/) != -1) { //if preceded by abil, delete if in R2, else replace by abl, otherwise,
  145. var a7Index2 = word.search(/(abil)$/);
  146. if (a7Index2 >= r2Index) {
  147. word = word.substring(0, a7Index2);
  148. } else {
  149. word = word.substring(0, a7Index2) + "abl";
  150. }
  151. } else if (word.search(/(ic)$/) != -1) {
  152. var a7Index3 = word.search(/(ic)$/);
  153. if (a7Index3 != -1 && a7Index3 >= r2Index) {
  154. word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2,
  155. } else { //else replace by iqU
  156. word = word.replace(/(ic)$/, 'iqU');
  157. }
  158. } else if (word.search(/(iv)$/) != r2Index) {
  159. word = word.replace(/(iv)$/, '');
  160. }
  161. } else if (a8Index != -1 && a8Index >= r2Index) {
  162. word = word.substring(0, a8Index);
  163. if (word.search(/(at)$/) >= r2Index) {
  164. word = word.replace(/(at)$/, '');
  165. if (word.search(/(ic)$/) >= r2Index) {
  166. word = word.replace(/(ic)$/, '');
  167. } else {
  168. word = word.replace(/(ic)$/, 'iqU');
  169. }
  170. }
  171. } else if (a9Index != -1) {
  172. word = word.replace(/(eaux)/, 'eau')
  173. } else if (a10Index >= r1Index) {
  174. word = word.replace(/(aux)/, 'al')
  175. } else if (a11Index != -1) {
  176. var a11Index2 = word.search(/(euse|euses)$/);
  177. if (a11Index2 >= r2Index) {
  178. word = word.substring(0, a11Index2);
  179. } else if (a11Index2 >= r1Index) {
  180. word = word.substring(0, a11Index2) + "eux";
  181. }
  182. } else if (a12Index != -1 && a12Index >= r1Index) {
  183. word = word.substring(0, a12Index + 1); //+1- amendment to non-vowel
  184. } else if (a13Index != -1 && a13Index >= rvIndex) {
  185. word = word.replace(/(amment)$/, 'ant');
  186. } else if (a14Index != -1 && a14Index >= rvIndex) {
  187. word = word.replace(/(emment)$/, 'ent');
  188. } else if (a15Index != -1 && a15Index >= rvIndex) {
  189. word = word.substring(0, a15Index + 1);
  190. }
  191. /* Step 2a: Verb suffixes beginning i*/
  192. var wordStep1 = word;
  193. var step2aDone = false;
  194. if (oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1) {
  195. step2aDone = true;
  196. var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i;
  197. if (word.search(b1Regex) >= rvIndex) {
  198. word = word.replace(b1Regex, '$1');
  199. }
  200. }
  201. /* Step 2b: Other verb suffixes*/
  202. if (step2aDone && wordStep1 == word) {
  203. if (word.search(/(ions)$/) >= r2Index) {
  204. word = word.replace(/(ions)$/, '');
  205. } else {
  206. var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i;
  207. if (word.search(b2Regex) >= rvIndex) {
  208. word = word.replace(b2Regex, '');
  209. } else {
  210. var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
  211. if (word.search(b3Regex) >= rvIndex) {
  212. word = word.replace(b3Regex, '');
  213. } else {
  214. var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
  215. if (word.search(b3Regex2) >= rvIndex) {
  216. word = word.replace(b3Regex2, '');
  217. }
  218. }
  219. }
  220. }
  221. }
  222. if (oriWord != word.toLowerCase()) {
  223. /* Step 3 */
  224. var rep = '';
  225. if (word.search(/Y$/) != -1) {
  226. word = word.replace(/Y$/, 'i');
  227. } else if (word.search(/ç$/) != -1) {
  228. word = word.replace(/ç$/, 'c');
  229. }
  230. } else {
  231. /* Step 4 */
  232. //If the word ends s, not preceded by a, i, o, u, è or s, delete it.
  233. if (word.search(/([^aiouès])s$/) >= rvIndex) {
  234. word = word.replace(/([^aiouès])s$/, '$1');
  235. }
  236. var e1Index = word.search(/ion$/);
  237. if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
  238. word = word.substring(0, e1Index);
  239. } else {
  240. var e2Index = word.search(/(ier|ière|Ier|Ière)$/);
  241. if (e2Index != -1 && e2Index >= rvIndex) {
  242. word = word.substring(0, e2Index) + "i";
  243. } else {
  244. if (word.search(/e$/) >= rvIndex) {
  245. word = word.replace(/e$/, ''); //delete last e
  246. } else if (word.search(/guë$/) >= rvIndex) {
  247. word = word.replace(/guë$/, 'gu');
  248. }
  249. }
  250. }
  251. }
  252. /* Step 5: Undouble */
  253. //word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1');
  254. word = word.replace(/(en|on)(n)$/, '$1');
  255. word = word.replace(/(ett)$/, 'et');
  256. word = word.replace(/(el|eil)(l)$/, '$1');
  257. /* Step 6: Un-accent */
  258. word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/, 'e$1');
  259. word = word.toLowerCase();
  260. return word;
  261. };
  262. });