processor_highlight.inc 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. <?php
  2. /**
  3. * @file
  4. * Contains the SearchApiHighlight class.
  5. */
  6. /**
  7. * Processor for highlighting search results.
  8. */
  9. class SearchApiHighlight extends SearchApiAbstractProcessor {
  10. /**
  11. * PREG regular expression for a word boundary.
  12. *
  13. * We highlight around non-indexable or CJK characters.
  14. *
  15. * @var string
  16. */
  17. protected static $boundary;
  18. /**
  19. * PREG regular expression for splitting words.
  20. *
  21. * @var string
  22. */
  23. protected static $split;
  /**
   * Constructs the processor and prepares the shared matching patterns.
   *
   * After delegating to the parent constructor, the static $boundary and
   * $split regular expressions are (re)built. Both are based on Drupal's
   * PREG_CLASS_UNICODE_WORD_BOUNDARY character class; $boundary additionally
   * treats a set of CJK code point ranges as boundary characters.
   *
   * @param SearchApiIndex $index
   *   The index, passed on unchanged to the parent constructor.
   * @param array $options
   *   (optional) The processor options, passed on unchanged to the parent
   *   constructor.
   */
  public function __construct(SearchApiIndex $index, array $options = array()) {
    parent::__construct($index, $options);

    // Code point ranges that are treated as CJK characters for highlighting.
    $cjk_ranges = implode('', array(
      '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}',
      '\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}',
      '\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}',
      '\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}',
      '\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}',
    ));

    // A boundary is a zero-width position that is either preceded or followed
    // by one of the boundary characters.
    $boundary_class = '[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . $cjk_ranges . ']';
    self::$boundary = '(?:(?<=' . $boundary_class . ')|(?=' . $boundary_class . '))';

    // Words are split on runs of plain word-boundary characters only.
    self::$split = '/[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']+/iu';
  }
  /**
   * Builds the settings form for the highlighting processor.
   *
   * Options that have not been set yet are filled with their defaults on
   * $this->options before the form elements read them.
   *
   * @return array
   *   A form array with the elements "prefix", "suffix", "excerpt",
   *   "excerpt_length", "exclude_fields", "highlight" and "highlight_partial",
   *   in that order.
   */
  public function configurationForm() {
    $defaults = array(
      'prefix' => '<strong>',
      'suffix' => '</strong>',
      'excerpt' => TRUE,
      'excerpt_length' => 256,
      'highlight' => 'always',
      'highlight_partial' => FALSE,
      'exclude_fields' => array(),
    );
    $this->options += $defaults;
    $settings = $this->options;

    // Offer every fulltext field that the index knows about as a candidate for
    // exclusion from the excerpt, labelled "Name (machine_name)".
    $known_fields = $this->index->getFields();
    $excludable = array();
    foreach ($this->index->getFulltextFields() as $name) {
      if (!isset($known_fields[$name])) {
        continue;
      }
      $excludable[$name] = check_plain($known_fields[$name]['name'] . ' (' . $name . ')');
    }

    // The excerpt length is only shown while "Create excerpt" is checked.
    $length_states = array(
      'visible' => array(
        '#edit-processors-search-api-highlighting-settings-excerpt' => array(
          'checked' => TRUE,
        ),
      ),
    );

    $form = array();

    $form['prefix'] = array(
      '#type' => 'textfield',
      '#title' => t('Highlighting prefix'),
      '#description' => t('Text/HTML that will be prepended to all occurrences of search keywords in highlighted text.'),
      '#default_value' => $settings['prefix'],
    );

    $form['suffix'] = array(
      '#type' => 'textfield',
      '#title' => t('Highlighting suffix'),
      '#description' => t('Text/HTML that will be appended to all occurrences of search keywords in highlighted text.'),
      '#default_value' => $settings['suffix'],
    );

    $form['excerpt'] = array(
      '#type' => 'checkbox',
      '#title' => t('Create excerpt'),
      '#description' => t('When enabled, an excerpt will be created for searches with keywords, containing all occurrences of keywords in a fulltext field.'),
      '#default_value' => $settings['excerpt'],
    );

    $form['excerpt_length'] = array(
      '#type' => 'textfield',
      '#title' => t('Excerpt length'),
      '#description' => t('The requested length of the excerpt, in characters.'),
      '#default_value' => $settings['excerpt_length'],
      '#element_validate' => array('element_validate_integer_positive'),
      '#states' => $length_states,
    );

    $form['exclude_fields'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Exclude fields from excerpt'),
      '#description' => t('Exclude certain fulltext fields from being displayed in the excerpt.'),
      '#options' => $excludable,
      '#default_value' => $settings['exclude_fields'],
      '#attributes' => array('class' => array('search-api-checkboxes-list')),
    );

    $form['highlight'] = array(
      '#type' => 'select',
      '#title' => t('Highlight returned field data'),
      '#description' => t('Select whether returned fields should be highlighted.'),
      '#options' => array(
        'always' => t('Always'),
        'server' => t('If the server returns fields'),
        'never' => t('Never'),
      ),
      '#default_value' => $settings['highlight'],
    );

    $form['highlight_partial'] = array(
      '#type' => 'checkbox',
      '#title' => t('Highlight partial matches'),
      '#description' => t('When enabled, matches in parts of words will be highlighted as well.'),
      '#default_value' => $settings['highlight_partial'],
    );

    return $form;
  }
  /**
   * Reduces the submitted "exclude_fields" value to the checked entries.
   *
   * Entries with an empty (falsy) value are dropped, so only the selected
   * fields remain in $values['exclude_fields']. Keys are preserved.
   *
   * @param array $form
   *   The form array; not used here.
   * @param array $values
   *   The submitted values for this processor's form, changed by reference.
   * @param array $form_state
   *   The form state; not used here.
   */
  public function configurationFormValidate(array $form, array &$values, array &$form_state) {
    $checked = array();
    foreach ($values['exclude_fields'] as $name => $state) {
      if ($state) {
        $checked[$name] = $state;
      }
    }
    $values['exclude_fields'] = $checked;
  }
  /**
   * Adds an excerpt and/or highlighted field values to the search results.
   *
   * Nothing is done when there are no results or when no positive keywords can
   * be extracted from the query. Fulltext fields listed in the
   * "exclude_fields" option are ignored for both the excerpt and highlighting.
   *
   * For each result:
   * - If the "excerpt" option is set, the values of all considered fulltext
   *   fields are flattened into one text and $result['excerpt'] is set to the
   *   excerpt created from it (NULL if no keyword occurs in the text).
   * - Unless the "highlight" option is "never", each available fulltext field
   *   is stored in $result['fields'] as an array with "#value" (the
   *   highlighted text, or a list of highlighted texts for list fields) and
   *   "#sanitize_callback" set to FALSE. With "always", items may be loaded
   *   to obtain field values; otherwise only values already present in the
   *   result (or a previously loaded item) are used.
   *
   * @param array $response
   *   The search response, changed by reference. Results are read from and
   *   written to $response['results'].
   * @param SearchApiQuery $query
   *   The executed query, used to determine the keywords to highlight.
   */
  public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
    if (empty($response['results']) || !($keys = $this->getKeywords($query))) {
      return;
    }

    $fulltext_fields = $this->index->getFulltextFields();
    if (!empty($this->options['exclude_fields'])) {
      // Key the fields by name so excluded ones can simply be unset.
      $fulltext_fields = drupal_map_assoc($fulltext_fields);
      foreach ($this->options['exclude_fields'] as $field) {
        unset($fulltext_fields[$field]);
      }
    }

    $load_for_highlight = $this->options['highlight'] == 'always';

    foreach ($response['results'] as $id => &$result) {
      if ($this->options['excerpt']) {
        // Collect the values of all fields into one flat list of texts.
        $text = array();
        $fields = $this->getFulltextFields($response['results'], $id, $fulltext_fields);
        foreach ($fields as $data) {
          if (is_array($data)) {
            $text = array_merge($text, $data);
          }
          else {
            $text[] = $data;
          }
        }
        $result['excerpt'] = $this->createExcerpt($this->flattenArrayValues($text), $keys);
      }

      if ($this->options['highlight'] != 'never') {
        $fields = $this->getFulltextFields($response['results'], $id, $fulltext_fields, $load_for_highlight);
        foreach ($fields as $field => $data) {
          if (is_array($data)) {
            // Start from an empty list so that a list field without values
            // still gets a "#value" key instead of an incomplete entry.
            $value = array();
            foreach ($data as $i => $item) {
              $value[$i] = $this->highlightField($item, $keys);
            }
          }
          else {
            $value = $this->highlightField($data, $keys);
          }
          // NOTE(review): "#sanitize_callback" => FALSE presumably tells
          // consumers that the value needs no further escaping - confirm
          // against the Search API result format documentation.
          $result['fields'][$field] = array(
            '#sanitize_callback' => FALSE,
            '#value' => $value,
          );
        }
      }
    }
    // Break the reference so later use of $result cannot alter the last item.
    unset($result);
  }
  /**
   * Retrieves the fulltext data of a result.
   *
   * Values already present in the result's "fields" are used first (after
   * being passed through search_api_get_sanitized_field_values()). Any
   * remaining fields are extracted from the loaded item if $load is TRUE or
   * the result already has a loaded item in its "entity" key. When an item
   * has to be loaded, the items of ALL results are loaded in one call and
   * stored in the "entity" key of the respective results.
   *
   * @param array $results
   *   All results returned in the search, by reference. The result at index
   *   $i gets a "fields" key if it had none, and results may get their
   *   "entity" key set.
   * @param int|string $i
   *   The index in the results array of the result whose data should be
   *   returned.
   * @param array $fulltext_fields
   *   The names of the fulltext fields whose data should be returned.
   * @param bool $load
   *   TRUE if the item should be loaded if necessary, FALSE if only fields
   *   already returned in the results should be used.
   *
   * @return array
   *   An array containing fulltext field names mapped to the text data
   *   contained in them for the given result. Fields for which no data could
   *   be determined are omitted.
   */
  protected function getFulltextFields(array &$results, $i, array $fulltext_fields, $load = TRUE) {
    global $language;

    $data = array();
    $result = &$results[$i];
    // Act as if $load is TRUE if we have a loaded item.
    $load = $load || !empty($result['entity']);
    $result += array('fields' => array());

    // We only need detailed fields data if $load is TRUE.
    $fields = $load ? $this->index->getFields() : array();
    $needs_extraction = array();
    $returned_fields = search_api_get_sanitized_field_values(array_intersect_key($result['fields'], array_flip($fulltext_fields)));
    foreach ($fulltext_fields as $field) {
      if (array_key_exists($field, $returned_fields)) {
        $data[$field] = $returned_fields[$field];
      }
      // Only extract fields the index actually has information about; a
      // fulltext field missing from getFields() would otherwise cause an
      // undefined index notice and an unusable NULL entry for extraction.
      elseif ($load && isset($fields[$field])) {
        $needs_extraction[$field] = $fields[$field];
      }
    }

    if (!$needs_extraction) {
      return $data;
    }

    if (empty($result['entity'])) {
      // Load the items of all results at once and remember them.
      $items = $this->index->loadItems(array_keys($results));
      foreach ($items as $id => $item) {
        $results[$id]['entity'] = $item;
      }
    }
    // If we still don't have a loaded item, we should stop trying.
    if (empty($result['entity'])) {
      return $data;
    }

    // Extract the missing values from the item, in the current language.
    $wrapper = $this->index->entityWrapper($result['entity'], FALSE);
    $wrapper->language($language->language);
    $extracted = search_api_extract_fields($wrapper, $needs_extraction, array('sanitize' => TRUE));
    foreach ($extracted as $field => $info) {
      if (isset($info['value'])) {
        $data[$field] = $info['value'];
      }
    }

    return $data;
  }
  /**
   * Extracts the positive keywords used in a search query.
   *
   * Array keys are handed to flattenKeysArray(). String keys are split with
   * the static $split pattern; surrounding single and double quotes are
   * trimmed from each word and empty words are discarded. A word such as "0"
   * is kept, matching what flattenKeysArray() does for array keys.
   *
   * @param SearchApiQuery $query
   *   The query from which to extract the keywords.
   *
   * @return array
   *   An array of all unique positive keywords used in the query, keyed by
   *   the keyword itself. Empty if the query has no keys or the keys string
   *   cannot be split (for instance, because it is not valid UTF-8).
   */
  protected function getKeywords(SearchApiQuery $query) {
    $keys = $query->getKeys();
    if (!$keys) {
      return array();
    }
    if (is_array($keys)) {
      return $this->flattenKeysArray($keys);
    }

    $words = preg_split(self::$split, $keys, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === FALSE) {
      // preg_split() fails on input that is not valid UTF-8 (the pattern uses
      // the "u" modifier). Without usable keywords nothing can be highlighted.
      return array();
    }

    // Keying by keyword removes duplicates while keeping the original order.
    $keywords = array();
    foreach ($words as $word) {
      // Remove quotes from keywords.
      $word = trim($word, "'\"");
      if ($word !== '') {
        $keywords[$word] = $word;
      }
    }
    return $keywords;
  }
  /**
   * Collects the positive keywords from a (possibly nested) keys array.
   *
   * A keys array with a non-empty "#negation" entry contributes no keywords
   * at all. Otherwise, every entry whose key passes element_child() is
   * processed: nested arrays are flattened recursively, all other values are
   * taken as keywords.
   *
   * @param array $keys
   *   A search keys array, as specified by SearchApiQueryInterface::getKeys().
   *
   * @return array
   *   An array of all unique positive keywords contained in the keys, keyed
   *   by the keyword itself.
   */
  protected function flattenKeysArray(array $keys) {
    $positive = array();

    if (empty($keys['#negation'])) {
      foreach ($keys as $name => $value) {
        // Skip everything that is not a child element of the keys array.
        if (element_child($name)) {
          if (is_array($value)) {
            // Keywords found earlier win over identical nested ones.
            $positive += $this->flattenKeysArray($value);
          }
          else {
            $positive[$value] = $value;
          }
        }
      }
    }

    return $positive;
  }
  /**
   * Returns snippets from a piece of text, with certain keywords highlighted.
   *
   * Largely copied from search_excerpt().
   *
   * Keywords are located with the same pattern rules that highlightField()
   * uses: case-insensitive and Unicode-aware, and restricted to whole words
   * unless the "highlight_partial" option is set. All positions and lengths
   * handled in this method are byte offsets; fragments always start and end at
   * a space, so multibyte characters are never cut apart.
   *
   * @param string $text
   *   The text to extract fragments from. HTML tags are stripped and HTML
   *   entities are decoded before searching.
   * @param array $keys
   *   Search keywords entered by the user.
   *
   * @return string|null
   *   A string containing HTML for the excerpt, or NULL if none of the
   *   keywords was found in the text.
   */
  protected function createExcerpt($text, array $keys) {
    // Prepare text by stripping HTML tags and decoding HTML entities. Spaces
    // are added around tags so that words separated only by tags stay apart.
    $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
    // The leading space guarantees that every real match has a position > 0.
    $text = ' ' . decode_entities($text);

    $max_length = $this->options['excerpt_length'];
    // Use the same matching rule as highlightField(), so that every fragment
    // picked here also contains something that gets highlighted.
    $boundary = empty($this->options['highlight_partial']) ? self::$boundary : '';

    // Extract fragments around keywords.
    // First we collect ranges of text around each keyword, starting/ending
    // at spaces, trying to get to the requested length.
    // If the sum of all fragments is too short, we look for second occurrences.
    $ranges = array();
    $included = array();
    $length = 0;
    $work_keys = $keys;
    while ($length < $max_length && $work_keys) {
      foreach ($work_keys as $k => $key) {
        if ($length >= $max_length) {
          break;
        }
        // Remember occurrence of key so we can skip over it if more occurrences
        // are desired.
        if (!isset($included[$key])) {
          $included[$key] = 0;
        }

        // Locate the keyword (byte position $p, 0 meaning "not found").
        $p = 0;
        $match_length = 0;
        $regex = '/' . $boundary . preg_quote($key, '/') . $boundary . '/iu';
        if (preg_match($regex, $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
          $p = $match[0][1];
          $match_length = strlen($match[0][0]);
        }

        // Now locate a space in front (position $q) and behind it (position
        // $s), leaving about 60 bytes extra before and after for context.
        // $text starts with a space; a space is appended on the fly for the
        // lookups below so that one can always be found at the end as well.
        if ($p) {
          if (($q = strpos(' ' . $text, ' ', max(0, $p - 61))) !== FALSE) {
            $end = substr($text . ' ', $p, 80);
            if (($s = strrpos($end, ' ')) !== FALSE) {
              // Account for the added spaces.
              $q = max($q - 1, 0);
              $s = min($s, strlen($end) - 1);
              $ranges[$q] = $p + $s;
              $length += $p + $s - $q;
              // Continue searching behind this match. Skipping the complete
              // match keeps the offset on a character boundary; an offset
              // inside a multibyte character makes preg_match() fail when the
              // "u" modifier is used, which would hide all later occurrences.
              $included[$key] = $p + max(1, $match_length);
              continue;
            }
          }
        }
        // Unless we got a match above, we don't need to look for this key any
        // more.
        unset($work_keys[$k]);
      }
    }

    if (!$ranges) {
      // We didn't find any keyword matches, so just return NULL.
      return NULL;
    }

    // Sort the text ranges by starting position.
    ksort($ranges);

    // Now we collapse overlapping text ranges into one. The sorting makes it
    // O(n).
    $newranges = array();
    $from1 = NULL;
    $to1 = NULL;
    foreach ($ranges as $from2 => $to2) {
      if ($from1 === NULL) {
        $from1 = $from2;
        $to1 = $to2;
        continue;
      }
      if ($from2 <= $to1) {
        $to1 = max($to1, $to2);
      }
      else {
        $newranges[$from1] = $to1;
        $from1 = $from2;
        $to1 = $to2;
      }
    }
    $newranges[$from1] = $to1;

    // Fetch the text of each collapsed range.
    $out = array();
    foreach ($newranges as $from => $to) {
      $out[] = substr($text, $from, $to - $from);
    }

    // Let translators have the ... separator text as one chunk.
    $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));

    // No leading dots if the excerpt starts at the very beginning of the text.
    $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
    $text = check_plain($text);

    // Since we stripped the tags at the beginning, highlighting doesn't need to
    // handle HTML anymore.
    return $this->highlightField($text, $keys, FALSE);
  }
  /**
   * Marks occurrences of the search keywords in a text field.
   *
   * Each match is wrapped in the configured "prefix" and "suffix" options,
   * which are inserted literally. Matching is case-insensitive and
   * Unicode-aware; unless the "highlight_partial" option is set, only complete
   * words are matched.
   *
   * @param string|array $text
   *   The text of the field. Arrays are flattened into a single string first.
   * @param array $keys
   *   Search keywords entered by the user. Empty keywords are ignored.
   * @param bool $html
   *   Whether the text can contain HTML tags or not. In the former case, text
   *   inside tags (i.e., tag names and attributes) won't be highlighted.
   *
   * @return string
   *   The field's text with all occurrences of search keywords highlighted.
   *   If there is nothing to search for, or the text cannot be processed by
   *   the regular expressions (for instance, because it is not valid UTF-8),
   *   the text is returned without highlighting.
   */
  protected function highlightField($text, array $keys, $html = TRUE) {
    if (is_array($text)) {
      $text = $this->flattenArrayValues($text);
    }

    if ($html) {
      // Split the text into alternating non-tag text (even indexes) and runs
      // of tags (odd indexes). Attribute values in double or single quotes may
      // contain ">" and are of arbitrary length.
      $texts = preg_split('#((?:</?[[:alpha:]](?:[^>"\']*|"[^"]*"|\'[^\']*\')*>)+)#i', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($texts === FALSE) {
        return $text;
      }
      for ($i = 0, $count = count($texts); $i < $count; $i += 2) {
        $texts[$i] = $this->highlightField($texts[$i], $keys, FALSE);
      }
      return implode('', $texts);
    }

    // An empty keyword would turn into an empty alternative that matches at
    // every position, so only non-empty keywords make it into the pattern.
    $patterns = array();
    foreach ($keys as $key) {
      if ((string) $key !== '') {
        $patterns[] = preg_quote($key, '/');
      }
    }
    if (!$patterns) {
      return $text;
    }

    // If "Highlight partial matches" is disabled, we only want to highlight
    // matches that are complete words. Otherwise, we want all of them.
    $boundary = empty($this->options['highlight_partial']) ? self::$boundary : '';
    $regex = '/' . $boundary . '(' . implode('|', $patterns) . ')' . $boundary . '/iu';

    // The text is padded with spaces so that keywords at its very start or end
    // are surrounded by boundary characters, too. Splitting with the matches
    // captured yields unmatched text at even and matches at odd indexes.
    $parts = preg_split($regex, ' ' . $text . ' ', -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === FALSE) {
      return $text;
    }

    // Wrap the matches by plain concatenation, so that "\" or "$" characters
    // in the prefix or suffix are never interpreted as back-references.
    $prefix = $this->options['prefix'];
    $suffix = $this->options['suffix'];
    for ($i = 1, $count = count($parts); $i < $count; $i += 2) {
      $parts[$i] = $prefix . $parts[$i] . $suffix;
    }

    // Remove the padding spaces again.
    return substr(implode('', $parts), 1, -1);
  }
  /**
   * Joins all values of a (possibly nested) array into a single string.
   *
   * Nested arrays are joined recursively with the same glue before being
   * joined with their siblings.
   *
   * @param array $array
   *   The array to flatten.
   * @param string $glue
   *   (optional) The separator to insert between individual array items.
   *
   * @return string
   *   The glued string.
   */
  protected function flattenArrayValues(array $array, $glue = " \n\n ") {
    $pieces = array();
    foreach ($array as $value) {
      $pieces[] = is_array($value) ? $this->flattenArrayValues($value, $glue) : $value;
    }
    return implode($glue, $pieces);
  }
  444. }