<?php

/**
 * @file
 * Contains SearchApiPorterStemmer and SearchApiPorter2.
 */

  /**
   * Stems words to their roots.
   *
   * Every run of letters and digits in a processed value is replaced by its
   * Porter2 stem (see SearchApiPorter2). Words listed in the "exceptions"
   * option are mapped to the configured stem instead.
   */
  class SearchApiPorterStemmer extends SearchApiAbstractProcessor {

    /**
     * Static cache for already generated stems.
     *
     * Keyed by the unstemmed word, as found in the processed text.
     *
     * @var array
     */
    protected $stems = array();

    /**
     * {@inheritdoc}
     */
    public function configurationForm() {
      $form = parent::configurationForm();

      $args = array(
        '@algorithm' => url('http://snowball.tartarus.org/algorithms/english/stemmer.html'),
      );

      // The "+" operator keeps any element the parent form already defines.
      $form += array(
        'help' => array(
          '#markup' => '<p>' . t('Optionally, provide an exclusion list to override the stemmer algorithm. (<a href="@algorithm">Read about the algorithm</a>.)', $args) . '</p>',
        ),
        'exceptions' => array(
          '#type' => 'textarea',
          '#title' => t('Exceptions'),
          '#description' => t('Enter exceptions in the form of WORD=STEM, where "WORD" is the term entered and "STEM" is the resulting stem. List each exception on a separate line.'),
          '#default_value' => "texan=texa",
        ),
      );

      // Show the stored exceptions instead of the example once some are saved.
      if (!empty($this->options['exceptions'])) {
        $form['exceptions']['#default_value'] = $this->options['exceptions'];
      }

      return $form;
    }

    /**
     * {@inheritdoc}
     *
     * Splits the value on every run of characters that are neither letters nor
     * digits, stems each resulting word and joins the stems with single spaces.
     */
    protected function process(&$value) {
      // Load custom exceptions.
      $exceptions = $this->getExceptions();

      // The pattern has no capturing group, so only the words themselves are
      // returned; PREG_SPLIT_NO_EMPTY drops the empty tokens produced by leading
      // or trailing separators.
      $words = preg_split('/[^\p{L}\p{N}]+/u', $value, -1, PREG_SPLIT_NO_EMPTY);
      if ($words === FALSE) {
        // preg_split() failed (e.g., on invalid UTF-8). Keep the value as it is
        // rather than replacing it with an empty string.
        return;
      }

      $stemmed = array();
      foreach ($words as $word) {
        if (!isset($this->stems[$word])) {
          $stemmer = new SearchApiPorter2($word, $exceptions);
          $this->stems[$word] = $stemmer->stem();
        }
        $stemmed[] = $this->stems[$word];
      }

      $value = implode(' ', $stemmed);
    }

    /**
     * Retrieves the processor's configured exceptions.
     *
     * The "exceptions" option is parsed as INI text (one WORD=STEM pair per
     * line). Entries that do not parse to a plain string, such as "[section]"
     * blocks or "word[]=stem" lines, are discarded. Note that PHP's INI parser
     * reserves some words (null, yes, no, true, false, on, off, none) which
     * therefore cannot be used as exception keys.
     *
     * @return string[]
     *   An associative array of exceptions, with words as keys and stems as their
     *   replacements. Empty if nothing is configured or parsing failed.
     */
    protected function getExceptions() {
      if (empty($this->options['exceptions'])) {
        return array();
      }

      $parsed = parse_ini_string($this->options['exceptions'], TRUE);
      if (!is_array($parsed)) {
        return array();
      }

      // Only WORD => STEM string pairs are usable by the stemmer.
      return array_filter($parsed, 'is_string');
    }

  }
  /**
   * Implements the Porter2 stemming algorithm.
   *
   * An object is bound to a single word: the constructor marks every "y" that
   * acts as a consonant as "Y" and determines the R1 and R2 regions, stem() then
   * runs steps 0 to 5 of the algorithm and returns the lower-cased result.
   *
   * The vowel checks are case-sensitive, so callers are expected to pass
   * lower-case words.
   *
   * @see https://github.com/markfullmer/porter2
   */
  class SearchApiPorter2 {

    /**
     * The word being stemmed.
     *
     * While stemming, a "y" that acts as a consonant is stored as "Y".
     *
     * @var string
     */
    protected $word;

    /**
     * The word exactly as it was passed to the constructor.
     *
     * Used for the exceptions lookup, which must not see the "Y" markers.
     *
     * @var string
     */
    protected $original;

    /**
     * The R1 of the word.
     *
     * Stored as the 0-based index at which the region starts. It is computed
     * once in the constructor; a value equal to the word's length means that the
     * region is empty.
     *
     * @var int
     *
     * @see http://snowball.tartarus.org/texts/r1r2.html.
     */
    protected $r1;

    /**
     * The R2 of the word.
     *
     * Stored like $r1, as the 0-based start index of the region.
     *
     * @var int
     *
     * @see http://snowball.tartarus.org/texts/r1r2.html.
     */
    protected $r2;

    /**
     * List of exceptions to be used.
     *
     * Keys are words, values are the stems to return for them.
     *
     * @var string[]
     */
    protected $exceptions = array();

    /**
     * Constructs a SearchApiPorter2 object.
     *
     * @param string $word
     *   The word to stem.
     * @param string[] $custom_exceptions
     *   (optional) A custom list of exceptions. Entries take precedence over the
     *   built-in exceptions with the same key.
     */
    public function __construct($word, $custom_exceptions = array()) {
      $this->word = $word;
      $this->original = $word;

      // With "+", keys already present in the custom list win.
      $this->exceptions = $custom_exceptions + array(
        'skis' => 'ski',
        'skies' => 'sky',
        'dying' => 'die',
        'lying' => 'lie',
        'tying' => 'tie',
        'idly' => 'idl',
        'gently' => 'gentl',
        'ugly' => 'ugli',
        'early' => 'earli',
        'only' => 'onli',
        'singly' => 'singl',
        'sky' => 'sky',
        'news' => 'news',
        'howe' => 'howe',
        'atlas' => 'atlas',
        'cosmos' => 'cosmos',
        'bias' => 'bias',
        'andes' => 'andes',
      );

      // Set initial y, or y after a vowel, to Y. The word is scanned left to
      // right, so a previously marked "Y" already counts as a non-vowel.
      $length = $this->length();
      for ($i = 0; $i < $length; $i++) {
        if (substr($this->word, $i, 1) === 'y' && ($i === 0 || $this->isVowel($i - 1))) {
          $this->word = substr_replace($this->word, 'Y', $i, 1);
        }
      }

      // Establish the regions R1 and R2. See function R().
      $this->r1 = $this->R(1);
      $this->r2 = $this->R(2);
    }

    /**
     * Computes the stem of the word.
     *
     * @return string
     *   The word's stem, in lower case.
     */
    public function stem() {
      // Ignore exceptions & words that are two letters or less.
      if (!$this->exceptions() && $this->length() > 2) {
        $this->step0();
        $this->step1a();
        $this->step1b();
        $this->step1c();
        $this->step2();
        $this->step3();
        $this->step4();
        $this->step5();
      }
      // Lower-casing also turns the "Y" markers back into "y".
      return strtolower($this->word);
    }

    /**
     * Determines whether the word is contained in our list of exceptions.
     *
     * If so, the $word property is changed to the stem listed in the exceptions.
     * The lookup uses the word as originally passed in, not the internal form in
     * which some "y"s have been replaced by "Y".
     *
     * @return bool
     *   TRUE if the word was an exception, FALSE otherwise.
     */
    protected function exceptions() {
      // Fall back to the current word if a subclass bypassed the constructor.
      $key = $this->original !== NULL ? $this->original : $this->word;
      if (isset($this->exceptions[$key])) {
        $this->word = $this->exceptions[$key];
        return TRUE;
      }
      return FALSE;
    }

    /**
     * Searches for the longest among the "s" suffixes and removes it.
     *
     * Implements step 0 of the Porter2 algorithm.
     */
    protected function step0() {
      // Ordered from longest to shortest, only the first match is removed.
      foreach (array("'s'", "'s", "'") as $suffix) {
        if ($this->hasEnding($suffix)) {
          $this->removeEnding($suffix);
          return;
        }
      }
    }

    /**
     * Handles various suffixes, of which the longest is replaced.
     *
     * Implements step 1a of the Porter2 algorithm.
     */
    protected function step1a() {
      if ($this->hasEnding('sses')) {
        $this->removeEnding('sses');
        $this->addEnding('ss');
        return;
      }

      foreach (array('ied', 'ies') as $suffix) {
        if ($this->hasEnding($suffix)) {
          // Replace by "i" if preceded by more than one letter, else by "ie".
          $replacement = $this->length() > 4 ? 'i' : 'ie';
          $this->removeEnding($suffix);
          $this->addEnding($replacement);
          return;
        }
      }

      // Words ending in "us" or "ss" are left alone.
      if ($this->hasEnding('us') || $this->hasEnding('ss')) {
        return;
      }

      // Delete if preceding word part has a vowel not immediately before the s.
      if ($this->hasEnding('s') && $this->containsVowel(substr($this->word, 0, -2))) {
        $this->removeEnding('s');
      }
    }

    /**
     * Handles various suffixes, of which the longest is replaced.
     *
     * Implements step 1b of the Porter2 algorithm.
     */
    protected function step1b() {
      // Words which this step has to leave untouched.
      $invariants = array(
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed',
      );
      if (in_array($this->word, $invariants, TRUE)) {
        return;
      }

      // "eed" and "eedly" are replaced by "ee", but only if located in R1.
      foreach (array('eedly', 'eed') as $suffix) {
        if ($this->hasEnding($suffix)) {
          if ($this->inR1($suffix)) {
            $this->removeEnding($suffix);
            $this->addEnding('ee');
          }
          return;
        }
      }

      foreach (array('ingly', 'edly', 'ing', 'ed') as $suffix) {
        // If the ending is present and the previous part contains a vowel.
        if (!$this->hasEnding($suffix) || !$this->containsVowel(substr($this->word, 0, -strlen($suffix)))) {
          continue;
        }
        $this->removeEnding($suffix);

        // After "at", "bl" or "iz", an "e" is restored.
        foreach (array('at', 'bl', 'iz') as $ending) {
          if ($this->hasEnding($ending)) {
            $this->addEnding('e');
            return;
          }
        }

        // If the word ends with a double, remove the last letter. Otherwise, if
        // the word is short, add e (so hop -> hope).
        if (!$this->removeDoubles() && $this->isShort()) {
          $this->addEnding('e');
        }
        return;
      }
    }

    /**
     * Replaces suffix y or Y with i if after non-vowel not @ word begin.
     *
     * Implements step 1c of the Porter2 algorithm.
     */
    protected function step1c() {
      $length = $this->length();
      if ($length <= 2 || !($this->hasEnding('y') || $this->hasEnding('Y'))) {
        return;
      }
      if (!$this->isVowel($length - 2)) {
        // removeEnding() only looks at the length, so this drops "Y", too.
        $this->removeEnding('y');
        $this->addEnding('i');
      }
    }

    /**
     * Implements step 2 of the Porter2 algorithm.
     *
     * The longest matching suffix is replaced if it lies in R1. "ogi" is only
     * replaced when preceded by "l", and "li" is only deleted when preceded by a
     * valid li-ending.
     */
    protected function step2() {
      // Ordered by length, so that the longest suffix is found first.
      $replacements = array(
        "ization" => "ize",
        "iveness" => "ive",
        "fulness" => "ful",
        "ational" => "ate",
        "ousness" => "ous",
        "biliti" => "ble",
        "tional" => "tion",
        "lessli" => "less",
        "fulli" => "ful",
        "entli" => "ent",
        "ation" => "ate",
        "aliti" => "al",
        "iviti" => "ive",
        "ousli" => "ous",
        "alism" => "al",
        "abli" => "able",
        "anci" => "ance",
        "alli" => "al",
        "izer" => "ize",
        "enci" => "ence",
        "ator" => "ate",
        "bli" => "ble",
        "ogi" => "og",
      );
      foreach ($replacements as $suffix => $replacement) {
        if (!$this->hasEnding($suffix)) {
          continue;
        }
        $applies = $this->inR1($suffix);
        if ($applies && $suffix === 'ogi') {
          // charAt(-4) is the letter right in front of "ogi".
          $applies = $this->charAt(-4) === 'l';
        }
        if ($applies) {
          $this->removeEnding($suffix);
          $this->addEnding($replacement);
        }
        return;
      }

      // charAt(-3) is the letter right in front of "li".
      if ($this->hasEnding('li') && $this->inR1('li') && $this->validLi($this->charAt(-3))) {
        $this->removeEnding('li');
      }
    }

    /**
     * Implements step 3 of the Porter2 algorithm.
     */
    protected function step3() {
      // Ordered by length, so that the longest suffix is found first.
      $replacements = array(
        'ational' => 'ate',
        'tional' => 'tion',
        'alize' => 'al',
        'icate' => 'ic',
        'iciti' => 'ic',
        'ical' => 'ic',
        'ness' => '',
        'ful' => '',
      );
      foreach ($replacements as $suffix => $replacement) {
        if ($this->hasEnding($suffix)) {
          if ($this->inR1($suffix)) {
            $this->removeEnding($suffix);
            $this->addEnding($replacement);
          }
          return;
        }
      }

      // "ative" is deleted only when it lies in R2.
      if ($this->hasEnding('ative') && $this->inR2('ative')) {
        $this->removeEnding('ative');
      }
    }

    /**
     * Implements step 4 of the Porter2 algorithm.
     */
    protected function step4() {
      // Ordered by length, so that the longest suffix is found first.
      $suffixes = array(
        'ement',
        'ment',
        'ance',
        'ence',
        'able',
        'ible',
        'ant',
        'ent',
        'ion',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
        'al',
        'er',
        'ic',
      );
      foreach ($suffixes as $suffix) {
        if (!$this->hasEnding($suffix)) {
          continue;
        }
        // Among the suffixes, if found and in R2, delete. "ion" additionally has
        // to be preceded by "s" or "t".
        if ($this->inR2($suffix) && ($suffix !== 'ion' || in_array($this->charAt(-4), array('s', 't'), TRUE))) {
          $this->removeEnding($suffix);
        }
        return;
      }
    }

    /**
     * Implements step 5 of the Porter2 algorithm.
     */
    protected function step5() {
      if ($this->hasEnding('e')) {
        // Delete if in R2, or in R1 and not preceded by a short syllable. The
        // vowel of the syllable in front of the "e" is at index length - 3.
        if ($this->inR2('e') || ($this->inR1('e') && !$this->isShortSyllable($this->length() - 3))) {
          $this->removeEnding('e');
        }
        return;
      }

      // Delete a final "l" if in R2 and preceded by l.
      if ($this->hasEnding('l') && $this->inR2('l') && $this->charAt(-2) === 'l') {
        $this->removeEnding('l');
      }
    }

    /**
     * Removes certain double consonants from the word's end.
     *
     * @return bool
     *   TRUE if a match was found and removed, FALSE otherwise.
     */
    protected function removeDoubles() {
      $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
      if (in_array(substr($this->word, -2), $doubles, TRUE)) {
        // Only the last of the two letters is dropped.
        $this->word = substr($this->word, 0, -1);
        return TRUE;
      }
      return FALSE;
    }

    /**
     * Checks whether a character is a vowel.
     *
     * Vowels are a, e, i, o, u and (lower-case) y. A position outside of the word
     * never counts as a vowel.
     *
     * @param int $position
     *   The character's position.
     * @param string|null $word
     *   (optional) The word in which to check. Defaults to $this->word.
     * @param string[] $additional
     *   (optional) Additional characters that should count as vowels.
     *
     * @return bool
     *   TRUE if the character is a vowel, FALSE otherwise.
     */
    protected function isVowel($position, $word = NULL, $additional = array()) {
      if ($word === NULL) {
        $word = $this->word;
      }
      $vowels = array_merge(array('a', 'e', 'i', 'o', 'u', 'y'), $additional);
      return in_array($this->charAt($position, $word), $vowels, TRUE);
    }

    /**
     * Retrieves the character at the given position.
     *
     * @param int $position
     *   The 0-based index of the character. If a negative number is given, the
     *   position is counted from the end of the string.
     * @param string|null $word
     *   (optional) The word from which to retrieve the character. Defaults to
     *   $this->word.
     *
     * @return string
     *   The character at the given position, or an empty string if the given
     *   position was illegal.
     */
    protected function charAt($position, $word = NULL) {
      if ($word === NULL) {
        $word = $this->word;
      }
      $length = strlen($word);
      if ($position < 0) {
        // -1 is the last character, -$length the first one.
        $position += $length;
      }
      if ($position < 0 || $position >= $length) {
        return '';
      }
      return $word[$position];
    }

    /**
     * Determines whether there is a short syllable at a given position.
     *
     * A short syllable is either a vowel at the very beginning of the word that
     * is followed by a non-vowel, or a vowel that is preceded by a non-vowel and
     * followed by a non-vowel other than "w", "x" and "Y".
     *
     * @param int|null $position
     *   (optional) The position of the syllable's vowel. Defaults to the
     *   second-last character, which checks whether the word ends in a short
     *   syllable.
     *
     * @return bool
     *   TRUE if there is a short syllable at the position, FALSE otherwise.
     */
    protected function isShortSyllable($position = NULL) {
      if ($position === NULL) {
        $position = $this->length() - 2;
      }
      // Words of less than two characters cannot contain a short syllable.
      if ($position < 0) {
        return FALSE;
      }
      // A vowel at the beginning of the word followed by a non-vowel.
      if ($position === 0) {
        return $this->isVowel(0) && !$this->isVowel(1);
      }
      // Vowel followed by non-vowel other than w, x, Y and preceded by
      // non-vowel.
      $additional = array('w', 'x', 'Y');
      return !$this->isVowel($position - 1) && $this->isVowel($position) && !$this->isVowel($position + 1, NULL, $additional);
    }

    /**
     * Determines whether the word is short.
     *
     * A word is called short if it ends in a short syllable and if R1 is null.
     * R1 was computed for the original word, so it counts as null as soon as its
     * start is not inside the current word anymore.
     *
     * @return bool
     *   TRUE if the word is short, FALSE otherwise.
     */
    protected function isShort() {
      return $this->isShortSyllable() && $this->r1 >= $this->length();
    }

    /**
     * Determines the start of a certain "R" region.
     *
     * R is a region after the first non-vowel following a vowel, or end of word.
     * For R1, words starting with "gener", "arsen" or "commun" are special-cased
     * so that the region starts right after that prefix.
     *
     * @param int $type
     *   (optional) 1 or 2. If 2, then calculate the R after the R1.
     *
     * @return int
     *   The R position: the 0-based index of the region's first character, or
     *   the length of the word if the region is empty.
     */
    protected function R($type = 1) {
      $length = $this->length();

      if ($type === 2) {
        // R2 is determined like R1, but the search only starts inside R1.
        $start = $this->r1;
      }
      else {
        $start = 1;
        if ($length > 5) {
          $prefix_5 = substr($this->word, 0, 5);
          if ($prefix_5 === 'gener' || $prefix_5 === 'arsen') {
            return 5;
          }
          if ($length > 6 && substr($this->word, 0, 6) === 'commun') {
            return 6;
          }
        }
      }

      for ($i = $start; $i < $length; $i++) {
        if (!$this->isVowel($i) && $this->isVowel($i - 1)) {
          // We add one, as this is the position AFTER the first non-vowel.
          return $i + 1;
        }
      }

      // No non-vowel after a vowel was found: the region is empty.
      return $length;
    }

    /**
     * Checks whether the given string is contained in R1.
     *
     * This is a plain substring search within the region. For a string that is
     * known to be a suffix of the word, it tells whether that suffix lies
     * completely inside R1.
     *
     * @param string $string
     *   The string.
     *
     * @return bool
     *   TRUE if the string is in R1, FALSE otherwise.
     */
    protected function inR1($string) {
      // Older PHP versions return FALSE if the offset is past the word's end.
      $region = (string) substr($this->word, $this->r1);
      return strpos($region, $string) !== FALSE;
    }

    /**
     * Checks whether the given string is contained in R2.
     *
     * This is a plain substring search within the region. For a string that is
     * known to be a suffix of the word, it tells whether that suffix lies
     * completely inside R2.
     *
     * @param string $string
     *   The string.
     *
     * @return bool
     *   TRUE if the string is in R2, FALSE otherwise.
     */
    protected function inR2($string) {
      // Older PHP versions return FALSE if the offset is past the word's end.
      $region = (string) substr($this->word, $this->r2);
      return strpos($region, $string) !== FALSE;
    }

    /**
     * Determines the string length of the current word.
     *
     * @return int
     *   The string length of the current word, in bytes.
     */
    protected function length() {
      return strlen($this->word);
    }

    /**
     * Checks whether the word ends with the given string.
     *
     * @param string $string
     *   The string.
     *
     * @return bool
     *   TRUE if the word ends with the given string, FALSE otherwise.
     */
    protected function hasEnding($string) {
      $length = strlen($string);
      if ($length > $this->length()) {
        return FALSE;
      }
      return substr_compare($this->word, $string, -$length, $length) === 0;
    }

    /**
     * Appends a given string to the current word.
     *
     * @param string $string
     *   The ending to append.
     */
    protected function addEnding($string) {
      $this->word .= $string;
    }

    /**
     * Removes a given string from the end of the current word.
     *
     * Does not check whether the ending is actually there: the word is simply
     * shortened by the length of the given string.
     *
     * @param string $string
     *   The ending to remove.
     */
    protected function removeEnding($string) {
      $this->word = substr($this->word, 0, -strlen($string));
    }

    /**
     * Checks whether the given string contains a vowel.
     *
     * @param string $string
     *   The string to check.
     *
     * @return bool
     *   TRUE if the string contains a vowel, FALSE otherwise.
     */
    protected function containsVowel($string) {
      $length = strlen($string);
      for ($i = 0; $i < $length; $i++) {
        if ($this->isVowel($i, $string)) {
          return TRUE;
        }
      }
      return FALSE;
    }

    /**
     * Checks whether the given string is a valid -li prefix.
     *
     * @param string $string
     *   The string to check, i.e., the letter in front of the "li" suffix.
     *
     * @return bool
     *   TRUE if the given string is a valid -li prefix, FALSE otherwise.
     */
    protected function validLi($string) {
      $valid = array('c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't');
      return in_array($string, $valid, TRUE);
    }

  }