processor_html_filter.inc 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. <?php
  2. /**
  3. * @file
  4. * Contains SearchApiHtmlFilter.
  5. */
  6. /**
  7. * Processor for stripping HTML from indexed fulltext data. Supports assigning
  8. * custom boosts for any HTML element.
  9. */
  10. // @todo Process query?
  class SearchApiHtmlFilter extends SearchApiAbstractProcessor {

    /**
     * Boost factors, keyed by lower-cased HTML tag name.
     *
     * Parsed from the "tags" option in the constructor.
     *
     * @var array
     */
    protected $tags;

    /**
     * Constructs the processor and parses the configured tag boosts.
     *
     * @param SearchApiIndex $index
     *   The index passed on to the parent constructor.
     * @param array $options
     *   The processor options, passed on to the parent constructor. Keys that
     *   are missing afterwards are filled with defaults: "title" (FALSE),
     *   "alt" (TRUE) and "tags" (boosts for h1, h2, h3, strong, b, em and u).
     */
    public function __construct(SearchApiIndex $index, array $options = array()) {
      parent::__construct($index, $options);
      $this->options += array(
        'title' => FALSE,
        'alt' => TRUE,
        'tags' => "h1 = 5\n" .
            "h2 = 3\n" .
            "h3 = 2\n" .
            "strong = 2\n" .
            "b = 2\n" .
            "em = 1.5\n" .
            'u = 1.5',
      );
      $this->tags = drupal_parse_info_format($this->options['tags']);
      // HTML tag names are case-insensitive, and parseText() looks tags up by
      // their lower-cased name, so normalize the configured keys as well.
      $this->tags = array_change_key_case($this->tags, CASE_LOWER);
      // Specifying empty tags doesn't make sense.
      unset($this->tags['br'], $this->tags['hr']);
    }

    /**
     * Builds the configuration form for this processor.
     *
     * @return array
     *   The parent's form, extended by the "title" and "alt" checkboxes and
     *   the "tags" textarea (unless the parent already defines those keys).
     */
    public function configurationForm() {
      $form = parent::configurationForm();
      $form += array(
        'title' => array(
          '#type' => 'checkbox',
          '#title' => t('Index title attribute'),
          '#description' => t('If set, the contents of title attributes will be indexed.'),
          '#default_value' => $this->options['title'],
        ),
        'alt' => array(
          '#type' => 'checkbox',
          '#title' => t('Index alt attribute'),
          '#description' => t('If set, the alternative text of images will be indexed.'),
          '#default_value' => $this->options['alt'],
        ),
        'tags' => array(
          '#type' => 'textarea',
          '#title' => t('Tag boosts'),
          '#description' => t('Specify special boost values for certain HTML elements, in <a href="@link">INI file format</a>. ' .
              'The boost values of nested elements are multiplied, elements not mentioned will have the default boost value of 1. ' .
              'Assign a boost of 0 to ignore the text content of that HTML element.',
              array('@link' => url('http://api.drupal.org/api/function/drupal_parse_info_format/7'))),
          '#default_value' => $this->options['tags'],
        ),
      );
      return $form;
    }

    /**
     * Validates the submitted tag boosts.
     *
     * Every boost has to be a non-negative number. All problems found are
     * reported together in a single error on the "tags" element.
     *
     * @param array $form
     *   The configuration form, as returned by configurationForm().
     * @param array $values
     *   The submitted values for this form.
     * @param array $form_state
     *   The form state, passed on to the parent implementation.
     */
    public function configurationFormValidate(array $form, array &$values, array &$form_state) {
      parent::configurationFormValidate($form, $values, $form_state);

      if (empty($values['tags'])) {
        return;
      }
      $tags = drupal_parse_info_format($values['tags']);
      $errors = array();
      foreach ($tags as $key => $value) {
        if (is_array($value)) {
          $errors[] = t("Boost value for tag &lt;@tag&gt; can't be an array.", array('@tag' => $key));
        }
        elseif (!is_numeric($value)) {
          $errors[] = t("Boost value for tag &lt;@tag&gt; must be numeric.", array('@tag' => $key));
        }
        elseif ($value < 0) {
          $errors[] = t('Boost value for tag &lt;@tag&gt; must be non-negative.', array('@tag' => $key));
        }
      }
      if ($errors) {
        form_error($form['tags'], implode("<br />\n", $errors));
      }
    }

    /**
     * Strips HTML from a single field value.
     *
     * @param string $value
     *   The value to process, changed in place. If tag boosts are configured,
     *   it is replaced by the token array returned by parseText(); otherwise
     *   by a plain string with all tags removed and entities decoded.
     */
    protected function processFieldValue(&$value) {
      // Let removed tags still delimit words.
      $text = str_replace(array('<', '>'), array(' <', '> '), $value);
      if ($this->options['title']) {
        // Move the content of title attributes behind their tag, as text.
        $text = preg_replace('/(<[-a-z_]+[^>]+)\btitle\s*=\s*("([^"]+)"|\'([^\']+)\')([^>]*>)/i', '$1 $5 $3$4 ', $text);
      }
      if ($this->options['alt']) {
        // Turn image alt texts into the content of an <img> element, so they
        // are indexed (and can be boosted via an "img" tag boost).
        $text = preg_replace('/<img\b[^>]+\balt\s*=\s*("([^"]+)"|\'([^\']+)\')[^>]*>/i', ' <img>$2$3</img> ', $text);
      }
      if ($this->tags) {
        // Only keep the tags which have a boost assigned, then tokenize.
        $text = strip_tags($text, '<' . implode('><', array_keys($this->tags)) . '>');
        $value = $this->parseText($text);
      }
      else {
        // Decode with the same flags and charset as parseText() does, so both
        // code paths treat entities identically.
        $value = $this->normalizeText(strip_tags($text));
      }
    }

    /**
     * Decodes HTML entities and tidies up whitespace in a tag-free string.
     *
     * @param string $text
     *   Text without HTML tags, possibly containing HTML entities.
     *
     * @return string
     *   The decoded text, trimmed and with whitespace runs collapsed.
     */
    protected function normalizeText($text) {
      $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
      // Remove any multiple or leading/trailing spaces we might have introduced.
      return preg_replace('/\s\s+/', ' ', trim($text));
    }

    /**
     * Splits HTML text into tokens, scored by the boosts of enclosing tags.
     *
     * Calls itself recursively for every opening tag and returns when the
     * matching closing tag (or the end of the text) is reached.
     *
     * @param string $text
     *   The text to parse, passed by reference. The consumed part is removed,
     *   so after the call it holds whatever follows the closing tag of
     *   $active_tag, or the empty string if the whole text was consumed.
     * @param string|null $active_tag
     *   (optional) The name of the currently open tag, if any.
     * @param float $boost
     *   (optional) The boost of the current context. A boost of 0 causes the
     *   text on this level to be ignored.
     *
     * @return array
     *   A list of tokens, each an array with the keys "value" (the decoded
     *   text) and "score" (the product of the boosts of all enclosing tags).
     *   Tokens with an empty value are never included.
     */
    protected function parseText(&$text, $active_tag = NULL, $boost = 1) {
      $ret = array();
      if ($active_tag) {
        // Tag names are compared in lower case throughout.
        $active_tag = strtolower($active_tag);
      }

      while (($pos = strpos($text, '<')) !== FALSE) {
        // The casts guard against substr() returning FALSE on older PHP
        // versions when the "<" is the last character.
        $before = (string) substr($text, 0, $pos);
        $after = (string) substr($text, $pos + 1);

        if (!preg_match('#^(/?)([:_a-zA-Z][-:_a-zA-Z0-9.]*)#', $after, $m)) {
          // A free-standing "<" that doesn't start a tag: keep it as a literal
          // character instead of dropping it and splitting the text there.
          $text = $before . '&lt;' . $after;
          continue;
        }

        if ($boost && $pos > 0) {
          $token = $this->normalizeText($before);
          // Text between tags is often just the padding added around them.
          if ($token !== '') {
            $ret[] = array(
              'value' => $token,
              'score' => $boost,
            );
          }
        }

        // Skip the rest of the tag. If there is no ">" at all, strpos()
        // returns FALSE and just a single character is skipped.
        $text = (string) substr($after, strpos($after, '>') + 1);
        $tag = strtolower($m[2]);

        if ($m[1]) {
          // Closing tag.
          if ($active_tag && $tag == $active_tag) {
            return $ret;
          }
        }
        else {
          // Opening tag => recursive call.
          $inner_boost = $boost * (isset($this->tags[$tag]) ? $this->tags[$tag] : 1);
          $ret = array_merge($ret, $this->parseText($text, $tag, $inner_boost));
        }
      }

      // Whatever is left contains no more tags. Compare against the empty
      // string explicitly, so a remainder of "0" isn't lost, and honor a boost
      // of 0 just like for the text inside the loop.
      if ($boost && (string) $text !== '') {
        $token = $this->normalizeText($text);
        if ($token !== '') {
          $ret[] = array(
            'value' => $token,
            'score' => $boost,
          );
        }
      }
      $text = '';

      return $ret;
    }

  }