processor_stopwords.inc 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. <?php
  2. /**
  3. * @file
  4. * Contains SearchApiStopWords.
  5. */
  6. /**
  7. * Processor for removing stopwords from index and search terms.
  8. */
  9. class SearchApiStopWords extends SearchApiAbstractProcessor {
  10. /**
  11. * Holds all words ignored for the last query.
  12. *
  13. * @var array
  14. */
  15. protected $ignored = array();
  16. public function configurationForm() {
  17. $form = parent::configurationForm();
  18. $form += array(
  19. 'help' => array(
  20. '#markup' => '<p>' . t('Provide a stopwords file or enter the words in this form. If you do both, both will be used. Read about !stopwords.', array('!stopwords' => l(t('stop words'), "http://en.wikipedia.org/wiki/Stop_words"))) . '</p>',
  21. ),
  22. 'file' => array(
  23. '#type' => 'textfield',
  24. '#title' => t('Stopwords file'),
  25. '#description' => t('This must be a stream-type description like <code>public://stopwords/stopwords.txt</code> or <code>http://example.com/stopwords.txt</code> or <code>private://stopwords.txt</code>.'),
  26. ),
  27. 'stopwords' => array(
  28. '#type' => 'textarea',
  29. '#title' => t('Stopwords'),
  30. '#description' => t('Enter a space and/or linebreak separated list of stopwords that will be removed from content before it is indexed and from search terms before searching.'),
  31. '#default_value' => t("but\ndid\nthe this that those\netc"),
  32. ),
  33. );
  34. if (!empty($this->options)) {
  35. $form['file']['#default_value'] = $this->options['file'];
  36. $form['stopwords']['#default_value'] = $this->options['stopwords'];
  37. }
  38. return $form;
  39. }
  40. public function configurationFormValidate(array $form, array &$values, array &$form_state) {
  41. parent::configurationFormValidate($form, $values, $form_state);
  42. $uri = $values['file'];
  43. if (!empty($uri) && !@file_get_contents($uri)) {
  44. $el = $form['file'];
  45. form_error($el, t('Stopwords file') . ': ' . t('The file %uri is not readable or does not exist.', array('%uri' => $uri)));
  46. }
  47. }
  48. public function process(&$value) {
  49. $stopwords = $this->getStopWords();
  50. if (empty($stopwords) || !is_string($value)) {
  51. return;
  52. }
  53. $words = preg_split('/\s+/', $value);
  54. foreach ($words as $sub_key => $sub_value) {
  55. if (isset($stopwords[$sub_value])) {
  56. unset($words[$sub_key]);
  57. $this->ignored[] = $sub_value;
  58. }
  59. }
  60. $value = implode(' ', $words);
  61. }
  62. public function preprocessSearchQuery(SearchApiQuery $query) {
  63. $this->ignored = array();
  64. parent::preprocessSearchQuery($query);
  65. }
  66. public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
  67. if ($this->ignored) {
  68. if (isset($response['ignored'])) {
  69. $response['ignored'] = array_merge($response['ignored'], $this->ignored);
  70. }
  71. else {
  72. $response['ignored'] = $this->ignored;
  73. }
  74. }
  75. }
  76. /**
  77. * Retrieves the processor's configured stopwords.
  78. *
  79. * @return array
  80. * An array whose keys are the stopwords set in either the file or the text
  81. * field.
  82. */
  83. protected function getStopWords() {
  84. if (isset($this->stopwords)) {
  85. return $this->stopwords;
  86. }
  87. $file_words = $form_words = array();
  88. if (!empty($this->options['file']) && $stopwords_file = file_get_contents($this->options['file'])) {
  89. $file_words = preg_split('/\s+/', $stopwords_file);
  90. }
  91. if (!empty($this->options['stopwords'])) {
  92. $form_words = preg_split('/\s+/', $this->options['stopwords']);
  93. }
  94. $this->stopwords = array_flip(array_merge($file_words, $form_words));
  95. return $this->stopwords;
  96. }
  97. }