processor_html_filter.inc 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. <?php
  /**
   * Processor for stripping HTML from indexed fulltext data. Supports assigning
   * custom boosts for any HTML element.
   */
  // @todo Process query?
  class SearchApiHtmlFilter extends SearchApiAbstractProcessor {

    /**
     * Boost factors keyed by lower-case HTML tag name.
     *
     * Parsed from the "tags" option in the constructor.
     *
     * @var array
     */
    protected $tags;

    /**
     * Constructs the processor, fills in default options and parses the boosts.
     *
     * @param SearchApiIndex $index
     *   The index this processor is attached to; passed on to the parent.
     * @param array $options
     *   Processor options. Missing keys are filled with defaults:
     *   - title: FALSE
     *   - alt: TRUE
     *   - tags: boosts for h1, h2, h3, strong, b, em and u, in INI format.
     */
    public function __construct(SearchApiIndex $index, array $options = array()) {
      parent::__construct($index, $options);
      $this->options += array(
        'title' => FALSE,
        'alt' => TRUE,
        'tags' => "h1 = 5\n" .
          "h2 = 3\n" .
          "h3 = 2\n" .
          "strong = 2\n" .
          "b = 2\n" .
          "em = 1.5\n" .
          'u = 1.5',
      );
      // Tag names are matched case-insensitively by strip_tags(), so normalize
      // the configured names to lower case to make boost lookups agree with it.
      $this->tags = array_change_key_case(drupal_parse_info_format($this->options['tags']), CASE_LOWER);
      // Specifying empty tags doesn't make sense.
      unset($this->tags['br'], $this->tags['hr']);
    }

    /**
     * Builds the configuration form for this processor.
     *
     * @return array
     *   The parent's form, extended by the "title", "alt" and "tags" elements.
     *   Elements already defined by the parent are not overwritten.
     */
    public function configurationForm() {
      $form = parent::configurationForm();
      $form += array(
        'title' => array(
          '#type' => 'checkbox',
          '#title' => t('Index title attribute'),
          '#description' => t('If set, the contents of title attributes will be indexed.'),
          '#default_value' => $this->options['title'],
        ),
        'alt' => array(
          '#type' => 'checkbox',
          '#title' => t('Index alt attribute'),
          '#description' => t('If set, the alternative text of images will be indexed.'),
          '#default_value' => $this->options['alt'],
        ),
        'tags' => array(
          '#type' => 'textarea',
          '#title' => t('Tag boosts'),
          '#description' => t('Specify special boost values for certain HTML elements, in <a href="@link">INI file format</a>. ' .
            'The boost values of nested elements are multiplied, elements not mentioned will have the default boost value of 1. ' .
            'Assign a boost of 0 to ignore the text content of that HTML element.',
            array('@link' => url('http://api.drupal.org/api/function/drupal_parse_info_format/7'))),
          '#default_value' => $this->options['tags'],
        ),
      );
      return $form;
    }

    /**
     * Validates the submitted tag boosts.
     *
     * Every boost has to be a single, numeric, non-negative value. All problems
     * found are reported together in one form error on the "tags" element.
     *
     * @param array $form
     *   The configuration form, as returned by configurationForm().
     * @param array $values
     *   The submitted values for this processor's form.
     * @param array $form_state
     *   The complete form state.
     */
    public function configurationFormValidate(array $form, array &$values, array &$form_state) {
      parent::configurationFormValidate($form, $values, $form_state);

      if (empty($values['tags'])) {
        return;
      }
      $tags = drupal_parse_info_format($values['tags']);
      $errors = array();
      foreach ($tags as $key => $value) {
        if (is_array($value)) {
          $errors[] = t("Boost value for tag &lt;@tag&gt; can't be an array.", array('@tag' => $key));
        }
        elseif (!is_numeric($value)) {
          $errors[] = t("Boost value for tag &lt;@tag&gt; must be numeric.", array('@tag' => $key));
        }
        elseif ($value < 0) {
          $errors[] = t('Boost value for tag &lt;@tag&gt; must be non-negative.', array('@tag' => $key));
        }
      }
      if ($errors) {
        form_error($form['tags'], implode("<br />\n", $errors));
      }
    }

    /**
     * Strips the HTML from a single field value.
     *
     * If tag boosts are configured, $value is replaced by the list of scored
     * text segments produced by parseText(); otherwise it is replaced by the
     * plain text without any tags.
     *
     * @param string $value
     *   The field value to process, changed by reference.
     */
    protected function processFieldValue(&$value) {
      $text = str_replace(array('<', '>'), array(' <', '> '), $value); // Let removed tags still delimit words.
      if ($this->options['title']) {
        // Move the contents of title attributes behind the tag they belong to.
        $text = preg_replace('/(<[-a-z_]+[^>]+)\btitle\s*=\s*("([^"]+)"|\'([^\']+)\')([^>]*>)/i', '$1 $5 $3$4 ', $text);
      }
      if ($this->options['alt']) {
        // Turn the alt text into the content of an <img> element, so that it
        // survives tag stripping and an "img" boost can apply to it.
        $text = preg_replace('/<img\b[^>]+\balt\s*=\s*("([^"]+)"|\'([^\']+)\')[^>]*>/i', ' <img>$2$3</img> ', $text);
      }
      if ($this->tags) {
        // Keep only the tags that have a boost configured, then score the text.
        $text = strip_tags($text, '<' . implode('><', array_keys($this->tags)) . '>');
        $value = $this->parseText($text);
      }
      else {
        $value = strip_tags($text);
      }
    }

    /**
     * Splits text containing HTML tags into scored text segments.
     *
     * Every opening tag starts a nested scope, parsed by a recursive call, whose
     * boost is the current boost multiplied by the boost configured for that tag
     * (1 if none is configured). The scope ends at the matching closing tag or
     * at the end of the text. Text inside a scope whose boost is 0 is dropped.
     *
     * @param string $text
     *   The text to parse. Passed by reference and consumed while parsing, so
     *   that the caller continues right behind the closing tag of a nested
     *   scope.
     * @param string|null $active_tag
     *   The lower-case name of the tag whose content is being parsed, or NULL on
     *   the top level.
     * @param float $boost
     *   The boost that applies to text directly inside the current scope.
     *
     * @return array
     *   A list of arrays, each with the keys "value" (the entity-decoded text)
     *   and "score" (the boost in effect for that text).
     */
    protected function parseText(&$text, $active_tag = NULL, $boost = 1) {
      $ret = array();
      while (($pos = strpos($text, '<')) !== FALSE) {
        if ($boost && $pos > 0) {
          $ret[] = array(
            'value' => html_entity_decode(substr($text, 0, $pos), ENT_QUOTES, 'UTF-8'),
            'score' => $boost,
          );
        }
        // substr() returns FALSE instead of '' on older PHP versions when the
        // "<" was the last character, hence the cast.
        $text = (string) substr($text, $pos + 1);
        // Tag names may contain digits (h1 to h6), so they have to be part of
        // the pattern, or the boosts for those tags would never be found.
        if (!preg_match('#^(/?)([:_a-zA-Z][-:_a-zA-Z0-9.]*)#', $text, $m)) {
          // A stray "<" that doesn't start a tag: drop it and keep scanning.
          continue;
        }
        // Skip the rest of the tag. An unterminated tag swallows the remainder.
        $end = strpos($text, '>');
        $text = ($end === FALSE) ? '' : (string) substr($text, $end + 1);
        $tag = strtolower($m[2]);
        if ($m[1]) {
          // Closing tag. Closing tags not matching the active tag are ignored.
          if ($active_tag && $tag == $active_tag) {
            return $ret;
          }
        }
        else {
          // Opening tag => recursive call.
          $factor = (isset($this->tags[$tag]) && is_numeric($this->tags[$tag])) ? $this->tags[$tag] : 1;
          $ret = array_merge($ret, $this->parseText($text, $tag, $boost * $factor));
        }
      }
      // Compare against '' explicitly, so that a trailing "0" isn't lost.
      if ($boost && (string) $text !== '') {
        $ret[] = array(
          'value' => html_entity_decode($text, ENT_QUOTES, 'UTF-8'),
          'score' => $boost,
        );
      }
      // The remaining text belongs to this scope even if it was dropped, so the
      // caller must not see it again.
      $text = '';
      return $ret;
    }

  }