processor_stemmer.inc 18 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732
  1. <?php
  2. /**
  3. * @file
  4. * Contains SearchApiPorterStemmer and SearchApiPorter2.
  5. */
  /**
   * Search API processor that replaces words with their Porter2 stems.
   *
   * Every run of letters and digits in a processed value is passed through
   * SearchApiPorter2; the separators between the words are kept as they are.
   * Site builders can override individual stems through the "exceptions"
   * option.
   */
  class SearchApiPorterStemmer extends SearchApiAbstractProcessor {

    /**
     * Static cache for already generated stems, keyed by the unstemmed word.
     *
     * @var array
     */
    protected $stems = array();

    /**
     * {@inheritdoc}
     */
    public function configurationForm() {
      $form = parent::configurationForm();

      // The keys have to match the placeholders used in the t() string below,
      // otherwise the links are rendered with a literal "@algorithm" href.
      $args = array(
        '@algorithm' => url('https://github.com/markfullmer/porter2'),
        '@exclusions' => url('https://github.com/markfullmer/porter2#user-content-custom-exclusions'),
      );

      // The "+=" only adds elements the parent form has not defined itself.
      $form += array(
        'help' => array(
          '#markup' => '<p>' . t('Optionally, provide an exclusion list to override the stemmer algorithm. Read about the <a href="@algorithm">algorithm</a> and <a href="@exclusions">exclusions</a>.', $args) . '</p>',
        ),
        'exceptions' => array(
          '#type' => 'textarea',
          '#title' => t('Exceptions'),
          '#description' => t('Enter exceptions in the form of WORD=STEM, where "WORD" is the term entered and "STEM" is the resulting stem. List each exception on a separate line.'),
          '#default_value' => "texan=texa",
        ),
      );

      // Show the stored exceptions instead of the example value, if any.
      if (!empty($this->options['exceptions'])) {
        $form['exceptions']['#default_value'] = $this->options['exceptions'];
      }

      return $form;
    }

    /**
     * {@inheritdoc}
     */
    protected function process(&$value) {
      // Load custom exceptions.
      $exceptions = $this->getExceptions();

      // The capturing group is required for PREG_SPLIT_DELIM_CAPTURE to return
      // the separators: the result then alternates between words (even
      // indexes) and separators (odd indexes).
      $parts = preg_split('/([^\p{L}\p{N}]+)/u', $value, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($parts === FALSE) {
        // The value could not be split (for example, because it is not valid
        // UTF-8). Leave it untouched instead of emptying it.
        return;
      }

      $stemmed = array();
      foreach ($parts as $i => $part) {
        $is_word = $i % 2 === 0 && $part !== '';
        if (!$is_word) {
          // Separators and empty leading/trailing pieces are passed through.
          $stemmed[] = $part;
          continue;
        }
        if (!isset($this->stems[$part])) {
          $stemmer = new SearchApiPorter2($part, $exceptions);
          $this->stems[$part] = $stemmer->stem();
        }
        $stemmed[] = $this->stems[$part];
      }

      $value = implode('', $stemmed);
    }

    /**
     * Retrieves the processor's configured exceptions.
     *
     * The "exceptions" option is parsed as INI-formatted text, one WORD=STEM
     * pair per line.
     *
     * @return string[]
     *   An associative array of exceptions, with words as keys and stems as
     *   their replacements. Empty if nothing is configured or the option
     *   cannot be parsed.
     */
    protected function getExceptions() {
      if (empty($this->options['exceptions'])) {
        return array();
      }
      $parsed = parse_ini_string($this->options['exceptions'], TRUE);
      return is_array($parsed) ? $parsed : array();
    }

  }
  /**
   * Implements the Porter2 (English Snowball) stemming algorithm.
   *
   * One object stems exactly one word: pass the word to the constructor and
   * call stem() to retrieve the result.
   *
   * @see https://github.com/markfullmer/porter2
   */
  class SearchApiPorter2 {

    /**
     * The word being stemmed.
     *
     * While the algorithm runs, a "y" that has to be treated as a consonant is
     * stored as an upper-case "Y". stem() lower-cases the word again.
     *
     * @var string
     */
    protected $word;

    /**
     * The word exactly as it was passed to the constructor.
     *
     * Used for the exception lookup, which must not see the "Y" markers.
     *
     * @var string
     */
    protected $original;

    /**
     * The R1 of the word.
     *
     * @var int
     *
     * @see http://snowball.tartarus.org/texts/r1r2.html.
     */
    protected $r1;

    /**
     * The R2 of the word.
     *
     * @var int
     *
     * @see http://snowball.tartarus.org/texts/r1r2.html.
     */
    protected $r2;

    /**
     * List of exceptions to be used.
     *
     * @var string[]
     */
    protected $exceptions = array();

    /**
     * Constructs a SearchApiPorter2 object.
     *
     * @param string $word
     *   The word to stem.
     * @param string[] $custom_exceptions
     *   (optional) A custom list of exceptions, with words as keys and their
     *   stems as values. Entries override the built-in exceptions.
     */
    public function __construct($word, $custom_exceptions = array()) {
      $this->original = $word;
      $this->word = $word;
      // With "+", the left operand wins, so custom exceptions take precedence.
      $this->exceptions = $custom_exceptions + array(
        'skis' => 'ski',
        'skies' => 'sky',
        'dying' => 'die',
        'lying' => 'lie',
        'tying' => 'tie',
        'idly' => 'idl',
        'gently' => 'gentl',
        'ugly' => 'ugli',
        'early' => 'earli',
        'only' => 'onli',
        'singly' => 'singl',
        'sky' => 'sky',
        'news' => 'news',
        'howe' => 'howe',
        'atlas' => 'atlas',
        'cosmos' => 'cosmos',
        'bias' => 'bias',
        'andes' => 'andes',
      );

      // Set initial y, or y after a vowel, to Y. The word keeps its length, so
      // the bound can be computed once.
      $length = $this->length();
      for ($i = 0; $i < $length; $i++) {
        if (substr($this->word, $i, 1) === 'y' && ($i === 0 || $this->isVowel($i - 1))) {
          $this->word = substr_replace($this->word, 'Y', $i, 1);
        }
      }

      // Establish the regions R1 and R2. R2 depends on R1, so the order
      // matters. See R().
      $this->r1 = $this->R(1);
      $this->r2 = $this->R(2);
    }

    /**
     * Computes the stem of the word.
     *
     * @return string
     *   The word's stem, in lower case.
     */
    public function stem() {
      // Exceptions and words of two letters or less skip the algorithm.
      if (!$this->exceptions() && $this->length() > 2) {
        $this->step0();
        $this->step1a();
        $this->step1b();
        $this->step1c();
        $this->step2();
        $this->step3();
        $this->step4();
        $this->step5();
      }
      // Also turns the "Y" consonant markers back into "y".
      return strtolower($this->word);
    }

    /**
     * Determines whether the word is contained in our list of exceptions.
     *
     * If so, the $word property is changed to the stem listed in the exceptions.
     * The lookup uses the word as passed to the constructor, so that exceptions
     * containing a "y" match regardless of the internal "Y" marking.
     *
     * @return bool
     *   TRUE if the word was an exception, FALSE otherwise.
     */
    protected function exceptions() {
      if (!isset($this->exceptions[$this->original])) {
        return FALSE;
      }
      $this->word = $this->exceptions[$this->original];
      return TRUE;
    }

    /**
     * Searches for the longest among the "s" suffixes and removes it.
     *
     * Implements step 0 of the Porter2 algorithm.
     */
    protected function step0() {
      // Ordered from longest to shortest; only the first match is removed.
      foreach (array("'s'", "'s", "'") as $suffix) {
        if ($this->hasEnding($suffix)) {
          $this->removeEnding($suffix);
          return;
        }
      }
    }

    /**
     * Handles various suffixes, of which the longest is replaced.
     *
     * Implements step 1a of the Porter2 algorithm.
     */
    protected function step1a() {
      // "sses" becomes "ss".
      if ($this->hasEnding('sses')) {
        $this->removeEnding('sses');
        $this->addEnding('ss');
        return;
      }

      // "ied" and "ies" become "i" if preceded by more than one letter, and
      // "ie" otherwise (cries -> cri, ties -> tie).
      foreach (array('ied', 'ies') as $suffix) {
        if ($this->hasEnding($suffix)) {
          $long = $this->length() > 4;
          $this->removeEnding($suffix);
          $this->addEnding($long ? 'i' : 'ie');
          return;
        }
      }

      // "us" and "ss" are left alone.
      if ($this->hasEnding('us') || $this->hasEnding('ss')) {
        return;
      }

      // Delete if preceding word part has a vowel not immediately before the s.
      if ($this->hasEnding('s') && $this->containsVowel((string) substr($this->word, 0, -2))) {
        $this->removeEnding('s');
      }
    }

    /**
     * Handles various suffixes, of which the longest is replaced.
     *
     * Implements step 1b of the Porter2 algorithm.
     */
    protected function step1b() {
      // Words this step has to leave unchanged.
      $invariants = array(
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed',
      );
      if (in_array($this->word, $invariants, TRUE)) {
        return;
      }

      // "eedly" and "eed" become "ee", but only if the suffix lies in R1.
      foreach (array('eedly', 'eed') as $suffix) {
        if ($this->hasEnding($suffix)) {
          if ($this->inR1($suffix)) {
            $this->removeEnding($suffix);
            $this->addEnding('ee');
          }
          return;
        }
      }

      foreach (array('ingly', 'edly', 'ing', 'ed') as $suffix) {
        // Only act if the ending is present and the part before it contains a
        // vowel.
        if (!$this->hasEnding($suffix)) {
          continue;
        }
        if (!$this->containsVowel((string) substr($this->word, 0, -strlen($suffix)))) {
          continue;
        }
        $this->removeEnding($suffix);

        // After "at", "bl" or "iz", add e (so luxuriat -> luxuriate).
        foreach (array('at', 'bl', 'iz') as $ending) {
          if ($this->hasEnding($ending)) {
            $this->addEnding('e');
            return;
          }
        }

        // If the word ends with a double, remove the last letter. Otherwise,
        // if the word is short, add e (so hop -> hope).
        if (!$this->removeDoubles() && $this->isShort()) {
          $this->addEnding('e');
        }
        return;
      }
    }

    /**
     * Replaces suffix y or Y with i if after non-vowel not @ word begin.
     *
     * Implements step 1c of the Porter2 algorithm.
     */
    protected function step1c() {
      $ends_in_y = $this->hasEnding('y') || $this->hasEnding('Y');
      if ($ends_in_y && $this->length() > 2 && !$this->isVowel($this->length() - 2)) {
        // Both "y" and "Y" are one character long, so this removes either.
        $this->removeEnding('y');
        $this->addEnding('i');
      }
    }

    /**
     * Implements step 2 of the Porter2 algorithm.
     *
     * Replaces the longest matching suffix, provided that it lies in R1.
     */
    protected function step2() {
      // Ordered from longest to shortest, so the first match is the longest.
      $checks = array(
        "ization" => "ize",
        "iveness" => "ive",
        "fulness" => "ful",
        "ational" => "ate",
        "ousness" => "ous",
        "biliti" => "ble",
        "tional" => "tion",
        "lessli" => "less",
        "fulli" => "ful",
        "entli" => "ent",
        "ation" => "ate",
        "aliti" => "al",
        "iviti" => "ive",
        "ousli" => "ous",
        "alism" => "al",
        "abli" => "able",
        "anci" => "ance",
        "alli" => "al",
        "izer" => "ize",
        "enci" => "ence",
        "ator" => "ate",
        "bli" => "ble",
        "ogi" => "og",
      );
      foreach ($checks as $find => $replace) {
        if (!$this->hasEnding($find)) {
          continue;
        }
        $applies = $this->inR1($find);
        // "ogi" is only replaced when preceded by "l" (analogi -> analog).
        if ($find === 'ogi' && $this->charAt(-4) !== 'l') {
          $applies = FALSE;
        }
        if ($applies) {
          $this->removeEnding($find);
          $this->addEnding($replace);
        }
        // The longest suffix has been found, whether or not it was replaced.
        return;
      }

      // Remove "li" if it follows a valid li-ending.
      if ($this->hasEnding('li') && $this->length() > 4 && $this->validLi($this->charAt(-3))) {
        $this->removeEnding('li');
      }
    }

    /**
     * Implements step 3 of the Porter2 algorithm.
     *
     * Replaces the longest matching suffix if it lies in R1; "ative" is removed
     * if it lies in R2.
     */
    protected function step3() {
      $checks = array(
        'ational' => 'ate',
        'tional' => 'tion',
        'alize' => 'al',
        'icate' => 'ic',
        'iciti' => 'ic',
        'ical' => 'ic',
        'ness' => '',
        'ful' => '',
      );
      foreach ($checks as $find => $replace) {
        if ($this->hasEnding($find)) {
          if ($this->inR1($find)) {
            $this->removeEnding($find);
            $this->addEnding($replace);
          }
          return;
        }
      }

      if ($this->hasEnding('ative') && $this->inR2('ative')) {
        $this->removeEnding('ative');
      }
    }

    /**
     * Implements step 4 of the Porter2 algorithm.
     *
     * Deletes the longest matching suffix if it lies in R2.
     */
    protected function step4() {
      $checks = array(
        'ement',
        'ment',
        'ance',
        'ence',
        'able',
        'ible',
        'ant',
        'ent',
        'ion',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
        'al',
        'er',
        'ic',
      );
      foreach ($checks as $check) {
        if (!$this->hasEnding($check)) {
          continue;
        }
        // "ion" is additionally only deleted when preceded by "s" or "t".
        $allowed = $check !== 'ion' || in_array($this->charAt(-4), array('s', 't'), TRUE);
        if ($allowed && $this->inR2($check)) {
          $this->removeEnding($check);
        }
        return;
      }
    }

    /**
     * Implements step 5 of the Porter2 algorithm.
     *
     * Removes a final "e" or the second "l" of a final "ll" under certain
     * conditions.
     */
    protected function step5() {
      if ($this->hasEnding('e')) {
        // Delete if in R2, or in R1 and not preceded by a short syllable.
        $in_r2 = $this->inR2('e');
        if ($in_r2 || ($this->inR1('e') && !$this->isShortSyllable($this->length() - 3))) {
          $this->removeEnding('e');
        }
        return;
      }

      // Delete a final l if in R2 and preceded by l.
      if ($this->hasEnding('l') && $this->inR2('l') && $this->charAt(-2) === 'l') {
        $this->removeEnding('l');
      }
    }

    /**
     * Removes certain double consonants from the word's end.
     *
     * @return bool
     *   TRUE if a match was found and removed, FALSE otherwise.
     */
    protected function removeDoubles() {
      $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
      if (!in_array(substr($this->word, -2), $doubles, TRUE)) {
        return FALSE;
      }
      $this->word = substr($this->word, 0, -1);
      return TRUE;
    }

    /**
     * Checks whether a character is a vowel.
     *
     * The upper-case "Y" consonant marker does not count as a vowel.
     *
     * @param int $position
     *   The character's position.
     * @param string|null $word
     *   (optional) The word in which to check. Defaults to $this->word.
     * @param string[] $additional
     *   (optional) Additional characters that should count as vowels.
     *
     * @return bool
     *   TRUE if the character is a vowel, FALSE otherwise. Positions outside of
     *   the word are never vowels.
     */
    protected function isVowel($position, $word = NULL, $additional = array()) {
      if ($word === NULL) {
        $word = $this->word;
      }
      $vowels = array_merge(array('a', 'e', 'i', 'o', 'u', 'y'), $additional);
      return in_array($this->charAt($position, $word), $vowels, TRUE);
    }

    /**
     * Retrieves the character at the given position.
     *
     * @param int $position
     *   The 0-based index of the character. If a negative number is given, the
     *   position is counted from the end of the string.
     * @param string|null $word
     *   (optional) The word from which to retrieve the character. Defaults to
     *   $this->word.
     *
     * @return string
     *   The character at the given position, or an empty string if the given
     *   position was illegal.
     */
    protected function charAt($position, $word = NULL) {
      if ($word === NULL) {
        $word = $this->word;
      }
      $length = strlen($word);
      // Translate negative positions first, so that -$length correctly
      // addresses the first character.
      if ($position < 0) {
        $position += $length;
      }
      if ($position < 0 || $position >= $length) {
        return '';
      }
      return $word[$position];
    }

    /**
     * Determines whether the word ends in a "vowel-consonant" suffix.
     *
     * Unless the pair is at the very start of the word, it also checks that the
     * pair is preceded by a non-vowel and that its second character is neither
     * "w", "x" nor "Y".
     *
     * @param int|null $position
     *   (optional) If given, do not check the end of the word, but the character
     *   at the given position, and the next one.
     *
     * @return bool
     *   TRUE if the word has the described suffix, FALSE otherwise.
     */
    protected function isShortSyllable($position = NULL) {
      if ($position === NULL) {
        $position = $this->length() - 2;
      }
      // A vowel at the beginning of the word followed by a non-vowel.
      if ($position === 0) {
        return $this->isVowel(0) && !$this->isVowel(1);
      }
      // Vowel followed by non-vowel other than w, x, Y and preceded by
      // non-vowel.
      $excluded = array('w', 'x', 'Y');
      return !$this->isVowel($position - 1)
        && $this->isVowel($position)
        && !$this->isVowel($position + 1, NULL, $excluded);
    }

    /**
     * Determines whether the word is short.
     *
     * A word is called short if it ends in a short syllable and if R1 is null.
     *
     * @return bool
     *   TRUE if the word is short, FALSE otherwise.
     */
    protected function isShort() {
      return $this->isShortSyllable() && $this->r1 == $this->length();
    }

    /**
     * Determines the start of a certain "R" region.
     *
     * R is a region after the first non-vowel following a vowel, or end of word.
     *
     * @param int $type
     *   (optional) 1 or 2. If 2, then calculate the R after the R1, which
     *   requires $this->r1 to be set already.
     *
     * @return int
     *   The R position. Equal to the word's length if the region is empty.
     */
    protected function R($type = 1) {
      $length = $this->length();
      $start = 1;
      if ($type === 2) {
        $start = $this->r1;
      }
      elseif ($length > 5) {
        // Prefixes for which R1 is fixed instead of computed.
        $prefix_5 = substr($this->word, 0, 5);
        if ($prefix_5 === 'gener' || $prefix_5 === 'arsen') {
          return 5;
        }
        if ($length > 6 && substr($this->word, 0, 6) === 'commun') {
          return 6;
        }
      }

      for ($i = $start; $i < $length; $i++) {
        if (!$this->isVowel($i) && $this->isVowel($i - 1)) {
          // We add one, as this is the position AFTER the first non-vowel.
          return $i + 1;
        }
      }
      return $length;
    }

    /**
     * Checks whether the given string is contained in R1.
     *
     * @param string $string
     *   The string.
     *
     * @return bool
     *   TRUE if the string is in R1, FALSE otherwise.
     */
    protected function inR1($string) {
      // Before PHP 8, substr() returns FALSE for an empty region.
      $region = (string) substr($this->word, $this->r1);
      return strpos($region, $string) !== FALSE;
    }

    /**
     * Checks whether the given string is contained in R2.
     *
     * @param string $string
     *   The string.
     *
     * @return bool
     *   TRUE if the string is in R2, FALSE otherwise.
     */
    protected function inR2($string) {
      // Before PHP 8, substr() returns FALSE for an empty region.
      $region = (string) substr($this->word, $this->r2);
      return strpos($region, $string) !== FALSE;
    }

    /**
     * Determines the string length of the current word.
     *
     * @return int
     *   The string length of the current word, in bytes.
     */
    protected function length() {
      return strlen($this->word);
    }

    /**
     * Checks whether the word ends with the given string.
     *
     * @param string $string
     *   The string.
     *
     * @return bool
     *   TRUE if the word ends with the given string, FALSE otherwise.
     */
    protected function hasEnding($string) {
      $length = strlen($string);
      if ($length > $this->length()) {
        return FALSE;
      }
      return substr_compare($this->word, $string, -1 * $length, $length) === 0;
    }

    /**
     * Appends a given string to the current word.
     *
     * @param string $string
     *   The ending to append.
     */
    protected function addEnding($string) {
      $this->word .= $string;
    }

    /**
     * Removes a given string from the end of the current word.
     *
     * Does not check whether the ending is actually there; only the length of
     * the given string is used.
     *
     * @param string $string
     *   The ending to remove.
     */
    protected function removeEnding($string) {
      $length = strlen($string);
      // A length of 0 would make substr() return an empty word.
      if ($length === 0) {
        return;
      }
      // Before PHP 8, substr() returns FALSE if nothing remains.
      $this->word = (string) substr($this->word, 0, -$length);
    }

    /**
     * Checks whether the given string contains a vowel.
     *
     * @param string $string
     *   The string to check.
     *
     * @return bool
     *   TRUE if the string contains a vowel, FALSE otherwise.
     */
    protected function containsVowel($string) {
      $length = strlen($string);
      for ($i = 0; $i < $length; $i++) {
        if ($this->isVowel($i, $string)) {
          return TRUE;
        }
      }
      return FALSE;
    }

    /**
     * Checks whether the given string is a valid -li prefix.
     *
     * @param string $string
     *   The string to check, usually the character in front of a final "li".
     *
     * @return bool
     *   TRUE if the given string is a valid -li prefix, FALSE otherwise.
     */
    protected function validLi($string) {
      $valid = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
      return in_array($string, $valid, TRUE);
    }

  }