processor_highlight.inc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. <?php
  2. /**
  3. * @file
  4. * Contains the SearchApiHighlight class.
  5. */
  6. /**
  7. * Processor for highlighting search results.
  8. */
  9. class SearchApiHighlight extends SearchApiAbstractProcessor {
  10. /**
  11. * PREG regular expression for a word boundary.
  12. *
  13. * We highlight around non-indexable or CJK characters.
  14. *
  15. * @var string
  16. */
  17. protected static $boundary;
  18. /**
  19. * PREG regular expression for splitting words.
  20. *
  21. * We split keywords on non-indexable or CJK characters.
  22. *
  23. * @var string
  24. */
  25. protected static $split;
  /**
   * Constructs the processor and prepares the shared regular expressions.
   *
   * Besides delegating to the parent constructor, this (re)initializes the
   * static $boundary and $split patterns used for locating keywords.
   *
   * @param SearchApiIndex $index
   *   The index passed on to the parent constructor.
   * @param array $options
   *   The processor options passed on to the parent constructor.
   */
  public function __construct(SearchApiIndex $index, array $options = array()) {
    parent::__construct($index, $options);

    // Unicode ranges that are treated like word-boundary characters in
    // addition to PREG_CLASS_UNICODE_WORD_BOUNDARY.
    $cjk_ranges = array(
      '\x{1100}-\x{11FF}',
      '\x{3040}-\x{309F}',
      '\x{30A1}-\x{318E}',
      '\x{31A0}-\x{31B7}',
      '\x{31F0}-\x{31FF}',
      '\x{3400}-\x{4DBF}',
      '\x{4E00}-\x{9FCF}',
      '\x{A000}-\x{A48F}',
      '\x{A4D0}-\x{A4FD}',
      '\x{A960}-\x{A97F}',
      '\x{AC00}-\x{D7FF}',
      '\x{F900}-\x{FAFF}',
      '\x{FF21}-\x{FF3A}',
      '\x{FF41}-\x{FF5A}',
      '\x{FF66}-\x{FFDC}',
      '\x{20000}-\x{2FFFD}',
      '\x{30000}-\x{3FFFD}',
    );

    // Content of the character class shared by both patterns.
    $boundary_chars = PREG_CLASS_UNICODE_WORD_BOUNDARY . implode('', $cjk_ranges);

    // Zero-width assertion: a boundary character is directly before or after.
    self::$boundary = '(?:(?<=[' . $boundary_chars . '])|(?=[' . $boundary_chars . ']))';
    // Complete pattern matching one or more consecutive boundary characters.
    self::$split = '/[' . $boundary_chars . ']+/iu';
  }
  /**
   * Builds the settings form for this processor.
   *
   * Missing options are filled with their defaults on $this->options before
   * the form elements are built.
   *
   * @return array
   *   A form array with the elements "prefix", "suffix", "excerpt",
   *   "excerpt_length" and "highlight".
   */
  public function configurationForm() {
    // Apply the defaults for every option that has not been set yet.
    $defaults = array(
      'prefix' => '<strong>',
      'suffix' => '</strong>',
      'excerpt' => TRUE,
      'excerpt_length' => 256,
      'highlight' => 'always',
    );
    $this->options += $defaults;
    $settings = $this->options;

    return array(
      'prefix' => array(
        '#type' => 'textfield',
        '#title' => t('Highlighting prefix'),
        '#description' => t('Text/HTML that will be prepended to all occurrences of search keywords in highlighted text.'),
        '#default_value' => $settings['prefix'],
      ),
      'suffix' => array(
        '#type' => 'textfield',
        '#title' => t('Highlighting suffix'),
        '#description' => t('Text/HTML that will be appended to all occurrences of search keywords in highlighted text.'),
        '#default_value' => $settings['suffix'],
      ),
      'excerpt' => array(
        '#type' => 'checkbox',
        '#title' => t('Create excerpt'),
        '#description' => t('When enabled, an excerpt will be created for searches with keywords, containing all occurrences of keywords in a fulltext field.'),
        '#default_value' => $settings['excerpt'],
      ),
      'excerpt_length' => array(
        '#type' => 'textfield',
        '#title' => t('Excerpt length'),
        '#description' => t('The requested length of the excerpt, in characters.'),
        '#default_value' => $settings['excerpt_length'],
        '#element_validate' => array('element_validate_integer_positive'),
        // Only show the length setting while "Create excerpt" is checked.
        '#states' => array(
          'visible' => array(
            '#edit-processors-search-api-highlighting-settings-excerpt' => array(
              'checked' => TRUE,
            ),
          ),
        ),
      ),
      'highlight' => array(
        '#type' => 'select',
        '#title' => t('Highlight returned field data'),
        '#description' => t('Select whether returned fields should be highlighted.'),
        '#options' => array(
          'always' => t('Always'),
          'server' => t('If the server returns fields'),
          'never' => t('Never'),
        ),
        '#default_value' => $settings['highlight'],
      ),
    );
  }
  /**
   * Validates the settings form; intentionally performs no checks.
   *
   * @param array $form
   *   The form array (unused).
   * @param array $values
   *   The submitted form values (unused, left unmodified).
   * @param array $form_state
   *   The form state (unused, left unmodified).
   */
  public function configurationFormValidate(array $form, array &$values, array &$form_state) {
    // Deliberately empty: this override exists so $form['fields'] is not
    // checked.
  }
  /**
   * Adds excerpts and keyword highlighting to the results of a search.
   *
   * Does nothing if the response contains no results or the query has no
   * positive keywords. Otherwise, depending on the processor options:
   * - "excerpt": sets $result['excerpt'] for each result (NULL if none of the
   *   keywords occur in the result's fulltext data).
   * - "highlight": unless set to "never", replaces the fulltext entries of
   *   $result['fields'] with highlighted versions. Items are only loaded for
   *   this purpose if the option is "always".
   *
   * @param array $response
   *   The search response, modified in place.
   * @param SearchApiQuery $query
   *   The executed query, used for determining the keywords.
   */
  public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
    // Test the results themselves instead of $response['result count']: the
    // count may be missing or may not match the results actually present.
    if (empty($response['results']) || !($keys = $this->getKeywords($query))) {
      return;
    }

    // Read the options defensively; they are unset if the settings form was
    // never saved. The fallbacks reproduce what the unset values evaluated to.
    $create_excerpt = !empty($this->options['excerpt']);
    $highlight_mode = isset($this->options['highlight']) ? $this->options['highlight'] : NULL;

    foreach ($response['results'] as $id => &$result) {
      if ($create_excerpt) {
        // Collect the values of all fulltext fields into one flat list.
        $text = array();
        $fields = $this->getFulltextFields($response['results'], $id);
        foreach ($fields as $data) {
          if (is_array($data)) {
            $text = array_merge($text, $data);
          }
          else {
            $text[] = $data;
          }
        }
        $result['excerpt'] = $this->createExcerpt(implode("\n\n", $text), $keys);
      }

      if ($highlight_mode != 'never') {
        $fields = $this->getFulltextFields($response['results'], $id, $highlight_mode == 'always');
        foreach ($fields as $field => $data) {
          if (is_array($data)) {
            // Multi-valued field: highlight each value separately.
            foreach ($data as $i => $text) {
              $result['fields'][$field][$i] = $this->highlightField($text, $keys);
            }
          }
          else {
            $result['fields'][$field] = $this->highlightField($data, $keys);
          }
        }
      }
    }
    // Break the reference so later writes to $result cannot alter the last
    // result by accident.
    unset($result);
  }
  /**
   * Collects the fulltext field values of a single search result.
   *
   * Values already present in the result's "fields" array are used as-is. If
   * loading is allowed (or the result already has a loaded item), the missing
   * fulltext fields are extracted from the item. When items have to be loaded,
   * they are loaded for all results at once and stored under each result's
   * "entity" key.
   *
   * @param array $results
   *   All results returned in the search, by reference.
   * @param int|string $i
   *   The key within $results of the result whose data should be returned.
   * @param bool $load
   *   TRUE if the item should be loaded if necessary, FALSE if only fields
   *   already returned in the results should be used.
   *
   * @return array
   *   An array mapping fulltext field names to the text data contained in
   *   them for the given result.
   */
  protected function getFulltextFields(array &$results, $i, $load = TRUE) {
    global $language;

    $result = &$results[$i];
    // An item that is already loaded can be used regardless of $load.
    $load = $load || !empty($result['entity']);
    $result += array('fields' => array());

    $fulltext_fields = $this->index->getFulltextFields();
    // Detailed field information is only needed when values get extracted.
    $field_info = $load ? $this->index->getFields() : array();

    // Split the fulltext fields into those already present and those that
    // still have to be extracted from the item.
    $data = array();
    $to_extract = array();
    foreach ($fulltext_fields as $field) {
      if (array_key_exists($field, $result['fields'])) {
        $data[$field] = $result['fields'][$field];
        continue;
      }
      if ($load) {
        $to_extract[$field] = $field_info[$field];
      }
    }
    if (!$to_extract) {
      return $data;
    }

    if (empty($result['entity'])) {
      // Load the items of all results in one go.
      foreach ($this->index->loadItems(array_keys($results)) as $id => $item) {
        $results[$id]['entity'] = $item;
      }
      // Give up if this result's item still could not be loaded.
      if (empty($result['entity'])) {
        return $data;
      }
    }

    $wrapper = $this->index->entityWrapper($result['entity'], FALSE);
    $wrapper->language($language->language);
    foreach (search_api_extract_fields($wrapper, $to_extract) as $field => $info) {
      if (isset($info['value'])) {
        $data[$field] = $info['value'];
      }
    }

    return $data;
  }
  /**
   * Extracts the positive keywords used in a search query.
   *
   * Array keys are flattened with flattenKeysArray(). String keys are split
   * on the characters matched by self::$split and stripped of surrounding
   * single and double quotes.
   *
   * @param SearchApiQuery $query
   *   The query from which to extract the keywords.
   *
   * @return array
   *   An array of all unique positive keywords used in the query, keyed by
   *   the keywords themselves. Empty if the query has no (usable) keys.
   */
  protected function getKeywords(SearchApiQuery $query) {
    $keys = $query->getKeys();
    if (!$keys) {
      return array();
    }
    if (is_array($keys)) {
      return $this->flattenKeysArray($keys);
    }

    // preg_split() returns FALSE on failure (for instance when the keys are
    // not valid UTF-8, as the pattern uses the "u" modifier). Treat that the
    // same as "no keywords" instead of passing FALSE on.
    $words = preg_split(self::$split, $keys, -1, PREG_SPLIT_NO_EMPTY);
    if (!$words) {
      return array();
    }

    // Strip quotes and key the result by keyword to remove duplicates. The
    // comparison against '' (rather than a truthiness test) keeps the keyword
    // "0" from being dropped.
    $keywords = array();
    foreach ($words as $word) {
      $word = trim($word, "'\"");
      if ($word !== '') {
        $keywords[$word] = $word;
      }
    }
    return $keywords;
  }
  /**
   * Collects the positive keywords from a (possibly nested) keys array.
   *
   * Entries whose array key is not an element child (as determined by
   * element_child()) are skipped, and a keys array with a non-empty
   * "#negation" entry contributes no keywords at all.
   *
   * @param array $keys
   *   A search keys array, as specified by SearchApiQueryInterface::getKeys().
   *
   * @return array
   *   An array of all unique positive keywords contained in the keys, keyed
   *   by the keywords themselves.
   */
  protected function flattenKeysArray(array $keys) {
    $keywords = array();

    // Negated groups do not contain any positive keywords.
    if (empty($keys['#negation'])) {
      foreach ($keys as $name => $value) {
        if (element_child($name)) {
          if (is_array($value)) {
            // Nested group: merge in its keywords, keeping existing ones.
            $keywords += $this->flattenKeysArray($value);
          }
          else {
            $keywords[$value] = $value;
          }
        }
      }
    }

    return $keywords;
  }
  /**
   * Returns snippets from a piece of text, with certain keywords highlighted.
   *
   * Largely copied from search_excerpt().
   *
   * @param string $text
   *   The text to extract fragments from.
   * @param array $keys
   *   Search keywords entered by the user.
   *
   * @return string|null
   *   A string containing HTML for the excerpt, or NULL if none of the
   *   keywords was found in the text.
   */
  protected function createExcerpt($text, array $keys) {
    // Prepare text by stripping HTML tags and decoding HTML entities. Spaces
    // are inserted around tags first, so words separated only by markup do not
    // run together once the tags are gone.
    $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
    $text = ' ' . decode_entities($text);

    // Extract fragments around keywords.
    // First we collect ranges of text around each keyword, starting/ending
    // at spaces, trying to get to the requested length.
    // If the sum of all fragments is too short, we look for second occurrences.
    $ranges = array();
    $included = array();
    $length = 0;
    $max_length = $this->options['excerpt_length'];
    $workkeys = $keys;
    while ($length < $max_length && count($workkeys)) {
      foreach ($workkeys as $k => $key) {
        if ($length >= $max_length) {
          break;
        }

        // Remember occurrence of key so we can skip over it if more occurrences
        // are desired.
        if (!isset($included[$key])) {
          $included[$key] = 0;
        }

        // Locate a keyword (position $p, always >0 because $text starts with a
        // space). The keyword is quoted so that regular expression
        // metacharacters and the "/" delimiter in it are matched literally.
        $p = 0;
        $pattern = '/' . self::$boundary . preg_quote($key, '/') . self::$boundary . '/iu';
        if (preg_match($pattern, $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
          $p = $match[0][1];
        }
        if (!$p) {
          // No (further) occurrence of this keyword.
          unset($workkeys[$k]);
          continue;
        }

        // Now locate a space in front (position $q) and behind it (position
        // $s), leaving about 60 characters extra before and after for context.
        // $text itself only starts with a space; the extra leading and
        // trailing spaces are added here just for these two lookups.
        $q = strpos(' ' . $text, ' ', max(0, $p - 61));
        $end = substr($text . ' ', $p, 80);
        $s = strrpos($end, ' ');
        if ($q === FALSE || $s === FALSE) {
          unset($workkeys[$k]);
          continue;
        }

        // Account for the added spaces.
        $q = max($q - 1, 0);
        $s = min($s, strlen($end) - 1);
        $ranges[$q] = $p + $s;
        $length += $p + $s - $q;
        // Continue searching after this match. Offsets are in bytes, and with
        // the "u" modifier an offset inside a multi-byte character makes
        // preg_match() fail, so skip the whole match rather than one byte.
        $included[$key] = $p + max(1, strlen($match[0][0]));
      }
    }

    if (count($ranges) == 0) {
      // We didn't find any keyword matches, so just return NULL.
      return NULL;
    }

    // Sort the text ranges by starting position.
    ksort($ranges);

    // Now we collapse overlapping text ranges into one. The sorting makes it
    // O(n).
    $newranges = array();
    $from1 = NULL;
    $to1 = NULL;
    foreach ($ranges as $from2 => $to2) {
      if ($from1 === NULL) {
        $from1 = $from2;
        $to1 = $to2;
        continue;
      }
      if ($from2 <= $to1) {
        $to1 = max($to1, $to2);
      }
      else {
        $newranges[$from1] = $to1;
        $from1 = $from2;
        $to1 = $to2;
      }
    }
    $newranges[$from1] = $to1;

    // Fetch text.
    $out = array();
    foreach ($newranges as $from => $to) {
      $out[] = substr($text, $from, $to - $from);
    }

    // Let translators have the ... separator text as one chunk. The leading
    // dots are omitted if the excerpt starts at the beginning of the text.
    $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
    $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
    $text = check_plain($text);

    return $this->highlightField($text, $keys);
  }
  /**
   * Marks occurrences of the search keywords in a text field.
   *
   * Every occurrence of a keyword that is delimited by the boundaries defined
   * in self::$boundary is wrapped in the configured "prefix" and "suffix"
   * options. Matching is case-insensitive.
   *
   * @param string $text
   *   The text of the field.
   * @param array $keys
   *   Search keywords entered by the user.
   *
   * @return string
   *   The field's text with all occurrences of search keywords highlighted.
   *   The text is returned unchanged if there are no keywords or if the
   *   regular expression could not be applied.
   */
  protected function highlightField($text, array $keys) {
    // Without keywords the pattern would match the empty string at every
    // boundary and scatter empty highlights across the text.
    if (!$keys) {
      return $text;
    }

    // Quote the keywords for literal matching, including the "/" delimiter
    // used by the pattern below.
    $quoted = array();
    foreach ($keys as $key) {
      $quoted[] = preg_quote($key, '/');
    }
    $pattern = '/' . self::$boundary . '(' . implode('|', $quoted) . ')' . self::$boundary . '/iu';

    // "\0" refers to the whole match. Note that prefix and suffix are part of
    // the replacement string, so references such as "$1" or "\1" in them are
    // interpreted by preg_replace() as well.
    $replace = $this->options['prefix'] . '\0' . $this->options['suffix'];

    // Pad the text with spaces so keywords at the very start or end are
    // delimited by a boundary character, too.
    $highlighted = preg_replace($pattern, $replace, ' ' . $text . ' ');
    if ($highlighted === NULL) {
      // A PCRE error occurred (for instance invalid UTF-8 in the text). Keep
      // the original text instead of replacing it with an empty value.
      return $text;
    }

    // Remove the padding again.
    return substr($highlighted, 1, -1);
  }
  369. }