processor_stopwords.inc 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. <?php
  /**
   * Processor for removing stopwords from index and search terms.
   *
   * Stopwords are taken from an optional file (any URI that
   * file_get_contents() can read) and/or from a whitespace-separated list
   * entered in the configuration form. Words that were removed are collected
   * and reported in the 'ignored' key of the search response.
   */
  class SearchApiStopWords extends SearchApiAbstractProcessor {

    /**
     * Stopwords removed by process() that have not been reported yet.
     *
     * NOTE(review): declared here to avoid dynamic property creation; assumes
     * the parent class does not declare a property of the same name with wider
     * visibility - confirm against SearchApiAbstractProcessor.
     *
     * @var array
     */
    protected $ignored = array();

    /**
     * Lazily built lookup table whose keys are the configured stopwords.
     *
     * Stays NULL until getStopWords() is called for the first time.
     *
     * @var array|null
     */
    protected $stopwords;

    /**
     * Builds the configuration form for this processor.
     *
     * @return array
     *   The parent's form, extended by a help text, a text field for the
     *   stopwords file URI and a textarea for manually entered stopwords.
     */
    public function configurationForm() {
      $form = parent::configurationForm();

      $form += array(
        'help' => array(
          '#markup' => '<p>' . t('Provide a stopwords file or enter the words in this form. If you do both, both will be used. Read about !stopwords.', array('!stopwords' => l(t('stop words'), "http://en.wikipedia.org/wiki/Stop_words"))) . '</p>',
        ),
        'file' => array(
          '#type' => 'textfield',
          // The original array defined '#title' twice, so the first value was
          // silently discarded; only the label-style title is kept.
          '#title' => t('Stopwords file URI'),
          '#description' => t('This must be a stream-type description like <code>public://stopwords/stopwords.txt</code> or <code>http://example.com/stopwords.txt</code> or <code>private://stopwords.txt</code>.'),
        ),
        'stopwords' => array(
          '#type' => 'textarea',
          '#title' => t('Stopwords'),
          '#description' => t('Enter a space and/or linebreak separated list of stopwords that will be removed from content before it is indexed and from search terms before searching.'),
          '#default_value' => t("but\ndid\nthe this that those\netc"),
        ),
      );

      // Check each option on its own: stored options may contain only one of
      // the two keys, and reading the missing one would raise a PHP notice.
      if (isset($this->options['file'])) {
        $form['file']['#default_value'] = $this->options['file'];
      }
      if (isset($this->options['stopwords'])) {
        $form['stopwords']['#default_value'] = $this->options['stopwords'];
      }

      return $form;
    }

    /**
     * Validates the submitted configuration.
     *
     * Sets a form error on the file element if neither a file URI nor any
     * stopwords were entered, or if the given URI cannot be read.
     *
     * @param array $form
     *   The form returned by configurationForm().
     * @param array $values
     *   The submitted values for this processor's form.
     * @param array $form_state
     *   The complete form state.
     */
    public function configurationFormValidate(array $form, array &$values, array &$form_state) {
      parent::configurationFormValidate($form, $values, $form_state);

      $stopwords = isset($values['stopwords']) ? trim($values['stopwords']) : '';
      $uri = isset($values['file']) ? $values['file'] : '';

      // Compare against the empty string instead of using empty(), so that a
      // value of "0" is not mistaken for missing input.
      if ($stopwords === '' && $uri === '') {
        $el = $form['file'];
        form_error($el, $el['#title'] . ': ' . t('Either a stopwords file or a list of stopwords is required.'));
      }

      // Only a failed read (FALSE) is an error; a readable file whose content
      // is empty or "0" must not be reported as unreadable.
      if ($uri !== '' && file_get_contents($uri) === FALSE) {
        $el = $form['file'];
        form_error($el, t('Stopwords file') . ': ' . t('The file %uri is not readable or does not exist.', array('%uri' => $uri)));
      }
    }

    /**
     * Removes all stopwords from a piece of text.
     *
     * The text is split on whitespace, every token that exactly matches a
     * configured stopword is dropped and remembered in $this->ignored, and the
     * remaining tokens are joined again with single spaces. Nothing is changed
     * if no stopwords are configured.
     *
     * @param string $value
     *   The text to process, altered in place.
     */
    public function process(&$value) {
      $stopwords = $this->getStopWords();
      if (empty($stopwords)) {
        return;
      }

      $words = preg_split('/\s+/', $value);
      foreach ($words as $position => $word) {
        // Stopwords are array keys, so this is an exact (case-sensitive) match.
        if (isset($stopwords[$word])) {
          unset($words[$position]);
          $this->ignored[] = $word;
        }
      }
      $value = implode(' ', $words);
    }

    /**
     * Adds the removed stopwords to the search response.
     *
     * The collected words are appended to $response['ignored'] (which is
     * created if necessary) and the internal list is emptied afterwards, so
     * that the same words are not reported again for a later query.
     *
     * @param array $response
     *   The search response, altered in place.
     * @param SearchApiQuery $query
     *   The executed query; not used by this processor.
     */
    public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
      if (empty($this->ignored)) {
        return;
      }

      if (isset($response['ignored'])) {
        $response['ignored'] = array_merge($response['ignored'], $this->ignored);
      }
      else {
        $response['ignored'] = $this->ignored;
      }

      $this->ignored = array();
    }

    /**
     * Retrieves the configured stopwords.
     *
     * The result is computed once and cached in $this->stopwords.
     *
     * @return array
     *   An array whose keys are the stopwords set in either the file or the
     *   text field.
     */
    protected function getStopWords() {
      if (isset($this->stopwords)) {
        return $this->stopwords;
      }

      $words = array();

      if (!empty($this->options['file'])) {
        $contents = file_get_contents($this->options['file']);
        // An unreadable file is skipped; the manually entered words still
        // apply.
        if ($contents !== FALSE) {
          // PREG_SPLIT_NO_EMPTY keeps leading or trailing whitespace (such as
          // a final newline) from producing an empty-string stopword.
          $words = preg_split('/\s+/', $contents, -1, PREG_SPLIT_NO_EMPTY);
        }
      }

      if (!empty($this->options['stopwords'])) {
        $words = array_merge($words, preg_split('/\s+/', $this->options['stopwords'], -1, PREG_SPLIT_NO_EMPTY));
      }

      // Flip the list so that membership can be tested with isset().
      $this->stopwords = array_flip($words);
      return $this->stopwords;
    }

  }