<?php

/**
 * @file
 * Contains SearchApiPorterStemmer and SearchApiPorter2.
 */
/**
 * Stems words to their roots.
 *
 * Every word of a processed value is run through the Porter2 stemmer
 * (SearchApiPorter2). Site builders can override individual stems through
 * the "exceptions" option, which holds one WORD=STEM pair per line.
 */
class SearchApiPorterStemmer extends SearchApiAbstractProcessor {

  /**
   * Static cache for already generated stems.
   *
   * Keys are the words as they appeared in the processed text, values are
   * the stems computed for them.
   *
   * @var array
   */
  protected $stems = array();

  /**
   * {@inheritdoc}
   */
  public function configurationForm() {
    $form = parent::configurationForm();

    $args = array(
      '@algorithm' => url('http://snowball.tartarus.org/algorithms/english/stemmer.html'),
    );
    $form += array(
      'help' => array(
        '#markup' => '<p>' . t('Optionally, provide an exclusion list to override the stemmer algorithm. (<a href="@algorithm">Read about the algorithm</a>.)', $args) . '</p>',
      ),
      'exceptions' => array(
        '#type' => 'textarea',
        '#title' => t('Exceptions'),
        '#description' => t('Enter exceptions in the form of WORD=STEM, where "WORD" is the term entered and "STEM" is the resulting stem. List each exception on a separate line.'),
        '#default_value' => "texan=texa",
      ),
    );

    // Show the stored exceptions instead of the example once some are saved.
    if (!empty($this->options['exceptions'])) {
      $form['exceptions']['#default_value'] = $this->options['exceptions'];
    }

    return $form;
  }

  /**
   * {@inheritdoc}
   *
   * Splits the value on every run of characters that are neither letters nor
   * digits, stems each resulting word and joins the stems with single spaces.
   */
  protected function process(&$value) {
    // The pattern has no capturing group, so the result contains only the
    // words themselves, never the delimiters. Every non-empty element
    // therefore has to be stemmed, not just every second one.
    $words = preg_split('/[^\p{L}\p{N}]+/u', $value);
    if ($words === FALSE) {
      // The split failed (for example because of invalid UTF-8). Leave the
      // value untouched instead of replacing it with an empty string.
      return;
    }

    // Load custom exceptions.
    $exceptions = $this->getExceptions();

    $stemmed = array();
    foreach ($words as $word) {
      // Empty strings occur when the value starts or ends with a delimiter;
      // they are passed through unchanged.
      if ($word === '') {
        $stemmed[] = $word;
        continue;
      }
      if (!isset($this->stems[$word])) {
        $stemmer = new SearchApiPorter2($word, $exceptions);
        $this->stems[$word] = $stemmer->stem();
      }
      $stemmed[] = $this->stems[$word];
    }

    $value = implode(' ', $stemmed);
  }

  /**
   * Retrieves the processor's configured exceptions.
   *
   * The "exceptions" option is parsed as an INI string, one WORD=STEM pair
   * per line.
   *
   * @return string[]
   *   An associative array of exceptions, with words as keys and stems as
   *   their replacements. Empty if no exceptions are configured or if the
   *   configured text could not be parsed.
   */
  protected function getExceptions() {
    if (empty($this->options['exceptions'])) {
      return array();
    }
    $exceptions = parse_ini_string($this->options['exceptions'], TRUE);
    return is_array($exceptions) ? $exceptions : array();
  }

}
/**
 * Implements the Porter2 (Snowball English) stemming algorithm.
 *
 * One object stems exactly one word: pass the word to the constructor and
 * call stem(). All string handling is byte-based, so only ASCII letters are
 * ever recognized as vowels or suffixes.
 *
 * @see https://github.com/markfullmer/porter2
 */
class SearchApiPorter2 {

  /**
   * The word being stemmed.
   *
   * While the algorithm runs, a "y" that acts as a consonant (at the start of
   * the word or directly after a vowel) is stored as uppercase "Y".
   *
   * @var string
   */
  protected $word;

  /**
   * The word exactly as it was passed to the constructor.
   *
   * Exceptions are looked up with this value, since their keys are written
   * without the internal "Y" marking.
   *
   * @var string
   */
  protected $original;

  /**
   * The R1 of the word.
   *
   * Stored as the offset at which the region starts; equal to the length of
   * the word if the region is empty.
   *
   * @var int
   *
   * @see http://snowball.tartarus.org/texts/r1r2.html.
   */
  protected $r1;

  /**
   * The R2 of the word.
   *
   * Stored as the offset at which the region starts; equal to the length of
   * the word if the region is empty.
   *
   * @var int
   *
   * @see http://snowball.tartarus.org/texts/r1r2.html.
   */
  protected $r2;

  /**
   * List of exceptions to be used.
   *
   * @var string[]
   */
  protected $exceptions = array();

  /**
   * Constructs a SearchApiPorter2 object.
   *
   * @param string $word
   *   The word to stem.
   * @param string[] $custom_exceptions
   *   (optional) A custom list of exceptions, keyed by word. Entries take
   *   precedence over the built-in exceptions with the same key.
   */
  public function __construct($word, $custom_exceptions = array()) {
    $this->word = (string) $word;
    $this->original = $this->word;
    $this->exceptions = $custom_exceptions + array(
      'skis' => 'ski',
      'skies' => 'sky',
      'dying' => 'die',
      'lying' => 'lie',
      'tying' => 'tie',
      'idly' => 'idl',
      'gently' => 'gentl',
      'ugly' => 'ugli',
      'early' => 'earli',
      'only' => 'onli',
      'singly' => 'singl',
      'sky' => 'sky',
      'news' => 'news',
      'howe' => 'howe',
      'atlas' => 'atlas',
      'cosmos' => 'cosmos',
      'bias' => 'bias',
      'andes' => 'andes',
    );

    // Set initial y, or y after a vowel, to Y. The check runs on the word as
    // modified so far, so a "y" following a freshly marked "Y" stays a vowel.
    $length = $this->length();
    for ($i = 0; $i < $length; $i++) {
      if ($this->word[$i] === 'y' && ($i === 0 || $this->isVowel($i - 1))) {
        $this->word = substr_replace($this->word, 'Y', $i, 1);
      }
    }

    // Establish the regions R1 and R2. See function R().
    $this->r1 = $this->R(1);
    $this->r2 = $this->R(2);
  }

  /**
   * Computes the stem of the word.
   *
   * @return string
   *   The word's stem, in lowercase.
   */
  public function stem() {
    // Ignore exceptions & words that are two letters or less.
    if (!$this->exceptions() && $this->length() > 2) {
      $this->step0();
      $this->step1a();
      $this->step1b();
      $this->step1c();
      $this->step2();
      $this->step3();
      $this->step4();
      $this->step5();
    }
    // Lowercasing also turns the internal "Y" markers back into "y".
    return strtolower($this->word);
  }

  /**
   * Determines whether the word is contained in our list of exceptions.
   *
   * If so, the $word property is changed to the stem listed in the exceptions.
   * The lookup uses the word as originally passed in, so that exceptions
   * containing a consonant "y" (e.g., "toys") are matched as well.
   *
   * @return bool
   *   TRUE if the word was an exception, FALSE otherwise.
   */
  protected function exceptions() {
    if (isset($this->exceptions[$this->original])) {
      $this->word = $this->exceptions[$this->original];
      return TRUE;
    }
    return FALSE;
  }

  /**
   * Searches for the longest among the "s" suffixes and removes it.
   *
   * Implements step 0 of the Porter2 algorithm.
   */
  protected function step0() {
    // Ordered from longest to shortest, so the first match is the longest.
    foreach (array("'s'", "'s", "'") as $ending) {
      if ($this->hasEnding($ending)) {
        $this->removeEnding($ending);
        return;
      }
    }
  }

  /**
   * Handles various suffixes, of which the longest is replaced.
   *
   * Implements step 1a of the Porter2 algorithm.
   */
  protected function step1a() {
    // sses -> ss.
    if ($this->hasEnding('sses')) {
      $this->removeEnding('sses');
      $this->addEnding('ss');
      return;
    }

    // ied/ies -> i if preceded by more than one letter, otherwise -> ie.
    foreach (array('ied', 'ies') as $ending) {
      if ($this->hasEnding($ending)) {
        $replacement = $this->length() > 4 ? 'i' : 'ie';
        $this->removeEnding($ending);
        $this->addEnding($replacement);
        return;
      }
    }

    // us and ss are left alone.
    if ($this->hasEnding('us') || $this->hasEnding('ss')) {
      return;
    }

    // Delete if preceding word part has a vowel not immediately before the s.
    if ($this->hasEnding('s') && $this->containsVowel(substr($this->word, 0, -2))) {
      $this->removeEnding('s');
    }
  }

  /**
   * Handles various suffixes, of which the longest is replaced.
   *
   * Implements step 1b of the Porter2 algorithm.
   */
  protected function step1b() {
    // Words that must not be touched by this step.
    $exceptions = array(
      'inning',
      'outing',
      'canning',
      'herring',
      'earring',
      'proceed',
      'exceed',
      'succeed',
    );
    if (in_array($this->word, $exceptions, TRUE)) {
      return;
    }

    // eedly/eed -> ee, but only if the suffix lies in R1.
    foreach (array('eedly', 'eed') as $ending) {
      if ($this->hasEnding($ending)) {
        if ($this->inR1($ending)) {
          $this->removeEnding($ending);
          $this->addEnding('ee');
        }
        return;
      }
    }

    foreach (array('ingly', 'edly', 'ing', 'ed') as $ending) {
      // If the ending is present and the previous part contains a vowel.
      if (!$this->hasEnding($ending) || !$this->containsVowel(substr($this->word, 0, -strlen($ending)))) {
        continue;
      }
      $this->removeEnding($ending);

      // After at, bl or iz, restore the e (so luxuriat -> luxuriate).
      foreach (array('at', 'bl', 'iz') as $second_ending) {
        if ($this->hasEnding($second_ending)) {
          $this->addEnding('e');
          return;
        }
      }

      // If the word ends with a double, remove the last letter. Otherwise,
      // if the word is short, add e (so hop -> hope).
      if (!$this->removeDoubles() && $this->isShort()) {
        $this->addEnding('e');
      }
      return;
    }
  }

  /**
   * Replaces suffix y or Y with i if after non-vowel not @ word begin.
   *
   * Implements step 1c of the Porter2 algorithm.
   */
  protected function step1c() {
    $ends_in_y = $this->hasEnding('y') || $this->hasEnding('Y');
    if ($ends_in_y && $this->length() > 2 && !$this->isVowel($this->length() - 2)) {
      // Both "y" and "Y" are one byte long, so this removes either of them.
      $this->removeEnding('y');
      $this->addEnding('i');
    }
  }

  /**
   * Implements step 2 of the Porter2 algorithm.
   *
   * Replaces the longest matching suffix, provided that it lies in R1.
   */
  protected function step2() {
    // Ordered by descending length, so the first match is the longest.
    $checks = array(
      "ization" => "ize",
      "iveness" => "ive",
      "fulness" => "ful",
      "ational" => "ate",
      "ousness" => "ous",
      "biliti" => "ble",
      "tional" => "tion",
      "lessli" => "less",
      "fulli" => "ful",
      "entli" => "ent",
      "ation" => "ate",
      "aliti" => "al",
      "iviti" => "ive",
      "ousli" => "ous",
      "alism" => "al",
      "abli" => "able",
      "anci" => "ance",
      "alli" => "al",
      "izer" => "ize",
      "enci" => "ence",
      "ator" => "ate",
      "bli" => "ble",
    );
    foreach ($checks as $find => $replace) {
      if ($this->hasEnding($find)) {
        if ($this->inR1($find)) {
          $this->removeEnding($find);
          $this->addEnding($replace);
        }
        return;
      }
    }

    // ogi -> og, but only if preceded by l (so analogi -> analog, while
    // pedagogi is left alone).
    if ($this->hasEnding('ogi')) {
      if ($this->inR1('ogi') && $this->charAt(-4) === 'l') {
        $this->removeEnding('ogi');
        $this->addEnding('og');
      }
      return;
    }

    // li: delete if in R1 and preceded by a valid li-ending.
    if ($this->hasEnding('li') && $this->inR1('li') && $this->validLi($this->charAt(-3))) {
      $this->removeEnding('li');
    }
  }

  /**
   * Implements step 3 of the Porter2 algorithm.
   *
   * Replaces the longest matching suffix, provided that it lies in R1 (or in
   * R2, for "ative").
   */
  protected function step3() {
    $checks = array(
      'ational' => 'ate',
      'tional' => 'tion',
      'alize' => 'al',
      'icate' => 'ic',
      'iciti' => 'ic',
      'ical' => 'ic',
      'ness' => '',
      'ful' => '',
    );
    foreach ($checks as $find => $replace) {
      if ($this->hasEnding($find)) {
        if ($this->inR1($find)) {
          $this->removeEnding($find);
          $this->addEnding($replace);
        }
        return;
      }
    }

    if ($this->hasEnding('ative') && $this->inR2('ative')) {
      $this->removeEnding('ative');
    }
  }

  /**
   * Implements step 4 of the Porter2 algorithm.
   *
   * Deletes the longest matching suffix, provided that it lies in R2.
   */
  protected function step4() {
    // Ordered so that a longer suffix is always tested before any shorter
    // suffix it ends with (e.g., "ement" before "ment" before "ent").
    $checks = array(
      'ement',
      'ment',
      'ance',
      'ence',
      'able',
      'ible',
      'ant',
      'ent',
      'ion',
      'ism',
      'ate',
      'iti',
      'ous',
      'ive',
      'ize',
      'al',
      'er',
      'ic',
    );
    foreach ($checks as $check) {
      if (!$this->hasEnding($check)) {
        continue;
      }
      // Among the suffixes, if found and in R2, delete. "ion" is only
      // deleted when preceded by s or t.
      if ($this->inR2($check)) {
        if ($check !== 'ion' || in_array($this->charAt(-4), array('s', 't'), TRUE)) {
          $this->removeEnding($check);
        }
      }
      return;
    }
  }

  /**
   * Implements step 5 of the Porter2 algorithm.
   */
  protected function step5() {
    if ($this->hasEnding('e')) {
      // Delete if in R2, or in R1 and not preceded by a short syllable.
      if ($this->inR2('e') || ($this->inR1('e') && !$this->isShortSyllable($this->length() - 3))) {
        $this->removeEnding('e');
      }
      return;
    }

    // Delete if in R2 and preceded by l.
    if ($this->hasEnding('l') && $this->inR2('l') && $this->charAt(-2) === 'l') {
      $this->removeEnding('l');
    }
  }

  /**
   * Removes certain double consonants from the word's end.
   *
   * Only the second letter of the pair is removed.
   *
   * @return bool
   *   TRUE if a match was found and removed, FALSE otherwise.
   */
  protected function removeDoubles() {
    $doubles = array('bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt');
    if (in_array(substr($this->word, -2), $doubles, TRUE)) {
      $this->word = substr($this->word, 0, -1);
      return TRUE;
    }
    return FALSE;
  }

  /**
   * Checks whether a character is a vowel.
   *
   * The vowels are a, e, i, o, u and lowercase y. An uppercase "Y" (the
   * marker for a consonant y) is not a vowel.
   *
   * @param int $position
   *   The character's position.
   * @param string|null $word
   *   (optional) The word in which to check. Defaults to $this->word.
   * @param string[] $additional
   *   (optional) Additional characters that should count as vowels.
   *
   * @return bool
   *   TRUE if the character is a vowel, FALSE otherwise. Positions outside
   *   of the word never count as vowels.
   */
  protected function isVowel($position, $word = NULL, $additional = array()) {
    if ($word === NULL) {
      $word = $this->word;
    }
    $vowels = array_merge(array('a', 'e', 'i', 'o', 'u', 'y'), $additional);
    return in_array($this->charAt($position, $word), $vowels, TRUE);
  }

  /**
   * Retrieves the character at the given position.
   *
   * @param int $position
   *   The 0-based index of the character. If a negative number is given, the
   *   position is counted from the end of the string.
   * @param string|null $word
   *   (optional) The word from which to retrieve the character. Defaults to
   *   $this->word.
   *
   * @return string
   *   The character at the given position, or an empty string if the given
   *   position was illegal.
   */
  protected function charAt($position, $word = NULL) {
    if ($word === NULL) {
      $word = $this->word;
    }
    $word = (string) $word;
    $length = strlen($word);
    // Translate negative positions first, so that -$length correctly maps to
    // the first character.
    if ($position < 0) {
      $position += $length;
    }
    if ($position < 0 || $position >= $length) {
      return '';
    }
    return $word[$position];
  }

  /**
   * Determines whether the word ends in a "vowel-consonant" suffix.
   *
   * Unless the word is only two characters long, it also checks that the
   * character before the vowel is a non-vowel and that the final consonant
   * is neither "w", "x" nor "Y".
   *
   * @param int|null $position
   *   (optional) If given, do not check the end of the word, but the character
   *   at the given position, and the next one.
   *
   * @return bool
   *   TRUE if the word has the described suffix, FALSE otherwise.
   */
  protected function isShortSyllable($position = NULL) {
    if ($position === NULL) {
      $position = $this->length() - 2;
    }
    // A vowel at the beginning of the word followed by a non-vowel.
    if ($position === 0) {
      return $this->isVowel(0) && !$this->isVowel(1);
    }
    // Vowel followed by non-vowel other than w, x, Y and preceded by
    // non-vowel.
    $additional = array('w', 'x', 'Y');
    return !$this->isVowel($position - 1)
      && $this->isVowel($position)
      && !$this->isVowel($position + 1, NULL, $additional);
  }

  /**
   * Determines whether the word is short.
   *
   * A word is called short if it ends in a short syllable and if R1 is null.
   *
   * @return bool
   *   TRUE if the word is short, FALSE otherwise.
   */
  protected function isShort() {
    // R1 was computed for the original word. It is null (empty) for the
    // current word if it starts at or after the current end of the word.
    return $this->isShortSyllable() && $this->r1 >= $this->length();
  }

  /**
   * Determines the start of a certain "R" region.
   *
   * R is a region after the first non-vowel following a vowel, or end of word.
   *
   * @param int $type
   *   (optional) 1 or 2. If 2, then calculate the R after the R1.
   *
   * @return int
   *   The R position. Never larger than the length of the word.
   */
  protected function R($type = 1) {
    $length = $this->length();

    if ($type === 2) {
      // R2 is found by applying the same rule inside R1.
      $start = $this->r1;
    }
    else {
      $start = 1;
      // Words that start with one of these prefixes (and are longer than it)
      // have a fixed R1 directly after the prefix.
      foreach (array('gener', 'arsen', 'commun') as $prefix) {
        $prefix_length = strlen($prefix);
        if ($length > $prefix_length && strncmp($this->word, $prefix, $prefix_length) === 0) {
          return $prefix_length;
        }
      }
    }

    for ($i = $start; $i < $length; $i++) {
      if ($this->isVowel($i - 1) && !$this->isVowel($i)) {
        // We add one, as this is the position AFTER the first non-vowel.
        return $i + 1;
      }
    }

    // No non-vowel following a vowel was found, so the region is empty.
    return $length;
  }

  /**
   * Checks whether the given string is contained in R1.
   *
   * @param string $string
   *   The string.
   *
   * @return bool
   *   TRUE if the string is in R1, FALSE otherwise.
   */
  protected function inR1($string) {
    return $this->inRegion($string, $this->r1);
  }

  /**
   * Checks whether the given string is contained in R2.
   *
   * @param string $string
   *   The string.
   *
   * @return bool
   *   TRUE if the string is in R2, FALSE otherwise.
   */
  protected function inR2($string) {
    return $this->inRegion($string, $this->r2);
  }

  /**
   * Checks whether a string occurs in the part of the word after an offset.
   *
   * For a string that is known to be a suffix of the word, this is TRUE
   * exactly if the whole suffix starts at or after the given offset.
   *
   * @param string $string
   *   The string to look for.
   * @param int $start
   *   The offset at which the region starts.
   *
   * @return bool
   *   TRUE if the string occurs in the region, FALSE otherwise.
   */
  protected function inRegion($string, $start) {
    // Before PHP 8, substr() returns FALSE for an offset past the end of the
    // word. The cast turns that into an empty region.
    $region = (string) substr($this->word, $start);
    return $region !== '' && strpos($region, $string) !== FALSE;
  }

  /**
   * Determines the string length of the current word.
   *
   * @return int
   *   The string length of the current word, in bytes.
   */
  protected function length() {
    return strlen($this->word);
  }

  /**
   * Checks whether the word ends with the given string.
   *
   * The comparison is case-sensitive.
   *
   * @param string $string
   *   The string.
   *
   * @return bool
   *   TRUE if the word ends with the given string, FALSE otherwise.
   */
  protected function hasEnding($string) {
    $length = strlen($string);
    if ($length > $this->length()) {
      return FALSE;
    }
    return (substr_compare($this->word, $string, -1 * $length, $length) === 0);
  }

  /**
   * Appends a given string to the current word.
   *
   * @param string $string
   *   The ending to append.
   */
  protected function addEnding($string) {
    $this->word .= $string;
  }

  /**
   * Removes a given string from the end of the current word.
   *
   * Does not check whether the ending is actually there; only the length of
   * the given string is used.
   *
   * @param string $string
   *   The ending to remove.
   */
  protected function removeEnding($string) {
    $length = strlen($string);
    // substr($word, 0, -0) would return an empty string, so an empty ending
    // has to be skipped explicitly.
    if ($length === 0) {
      return;
    }
    $this->word = (string) substr($this->word, 0, -$length);
  }

  /**
   * Checks whether the given string contains a vowel.
   *
   * @param string $string
   *   The string to check.
   *
   * @return bool
   *   TRUE if the string contains a vowel, FALSE otherwise.
   */
  protected function containsVowel($string) {
    // Same vowel set as isVowel() without additional characters. The cast
    // covers substr() returning FALSE before PHP 8.
    return strpbrk((string) $string, 'aeiouy') !== FALSE;
  }

  /**
   * Checks whether the given string is a valid -li prefix.
   *
   * @param string $string
   *   The string to check, i.e., the character directly in front of "li".
   *
   * @return bool
   *   TRUE if the given string is a valid -li prefix, FALSE otherwise.
   */
  protected function validLi($string) {
    return in_array($string, array(
      'c',
      'd',
      'e',
      'g',
      'h',
      'k',
      'm',
      'n',
      'r',
      't',
    ), TRUE);
  }

}
|