<?php

/**
 * @file
 * Contains the SearchApiHighlight class.
 */

/**
 * Processor for highlighting search results.
 *
 * Depending on its configuration, this processor adds an "excerpt" to every
 * search result (fragments of the fulltext fields surrounding the searched
 * keywords) and wraps keyword occurrences in the returned fulltext fields
 * with a configurable prefix and suffix.
 */
class SearchApiHighlight extends SearchApiAbstractProcessor {

  /**
   * PREG regular expression for a word boundary.
   *
   * We highlight around non-indexable or CJK characters.
   *
   * This is a zero-width assertion (no delimiters, no modifiers) meant to be
   * embedded in a larger pattern.
   *
   * @var string
   */
  protected static $boundary;

  /**
   * PREG regular expression for splitting words.
   *
   * This is a complete pattern, including delimiters and modifiers.
   *
   * @var string
   */
  protected static $split;

  /**
   * {@inheritdoc}
   */
  public function __construct(SearchApiIndex $index, array $options = array()) {
    parent::__construct($index, $options);

    // Character ranges that are treated as boundaries in addition to the
    // generic word boundary class (see the documentation of self::$boundary).
    $cjk = '\x{1100}-\x{11FF}\x{3040}-\x{309F}\x{30A1}-\x{318E}' .
      '\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DBF}\x{4E00}-\x{9FCF}' .
      '\x{A000}-\x{A48F}\x{A4D0}-\x{A4FD}\x{A960}-\x{A97F}\x{AC00}-\x{D7FF}' .
      '\x{F900}-\x{FAFF}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{FF66}-\x{FFDC}' .
      '\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}';
    // A boundary is any position directly after or directly before one of
    // the boundary characters.
    self::$boundary = '(?:(?<=[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . $cjk . '])|(?=[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . $cjk . ']))';
    self::$split = '/[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']+/iu';
  }

  /**
   * {@inheritdoc}
   */
  public function configurationForm() {
    // Fill in defaults for all options that have not been saved yet.
    $this->options += array(
      'prefix' => '<strong>',
      'suffix' => '</strong>',
      'excerpt' => TRUE,
      'excerpt_length' => 256,
      'highlight' => 'always',
      'highlight_partial' => FALSE,
      'exclude_fields' => array(),
    );

    $form = array();

    $form['prefix'] = array(
      '#type' => 'textfield',
      '#title' => t('Highlighting prefix'),
      '#description' => t('Text/HTML that will be prepended to all occurrences of search keywords in highlighted text.'),
      '#default_value' => $this->options['prefix'],
    );
    $form['suffix'] = array(
      '#type' => 'textfield',
      '#title' => t('Highlighting suffix'),
      '#description' => t('Text/HTML that will be appended to all occurrences of search keywords in highlighted text.'),
      '#default_value' => $this->options['suffix'],
    );
    $form['excerpt'] = array(
      '#type' => 'checkbox',
      '#title' => t('Create excerpt'),
      '#description' => t('When enabled, an excerpt will be created for searches with keywords, containing all occurrences of keywords in a fulltext field.'),
      '#default_value' => $this->options['excerpt'],
    );
    $form['excerpt_length'] = array(
      '#type' => 'textfield',
      '#title' => t('Excerpt length'),
      '#description' => t('The requested length of the excerpt, in characters.'),
      '#default_value' => $this->options['excerpt_length'],
      '#element_validate' => array('element_validate_integer_positive'),
      // Only show the length setting while "Create excerpt" is checked.
      '#states' => array(
        'visible' => array(
          '#edit-processors-search-api-highlighting-settings-excerpt' => array(
            'checked' => TRUE,
          ),
        ),
      ),
    );

    // Exclude certain fulltext fields.
    $fields = $this->index->getFields();
    $fulltext_fields = array();
    foreach ($this->index->getFulltextFields() as $field) {
      if (isset($fields[$field])) {
        $fulltext_fields[$field] = check_plain($fields[$field]['name'] . ' (' . $field . ')');
      }
    }
    $form['exclude_fields'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Exclude fields from excerpt'),
      '#description' => t('Exclude certain fulltext fields from being displayed in the excerpt.'),
      '#options' => $fulltext_fields,
      '#default_value' => $this->options['exclude_fields'],
      '#attributes' => array('class' => array('search-api-checkboxes-list')),
    );

    $form['highlight'] = array(
      '#type' => 'select',
      '#title' => t('Highlight returned field data'),
      '#description' => t('Select whether returned fields should be highlighted.'),
      '#options' => array(
        'always' => t('Always'),
        'server' => t('If the server returns fields'),
        'never' => t('Never'),
      ),
      '#default_value' => $this->options['highlight'],
    );
    $form['highlight_partial'] = array(
      '#type' => 'checkbox',
      '#title' => t('Highlight partial matches'),
      '#description' => t('When enabled, matches in parts of words will be highlighted as well.'),
      '#default_value' => $this->options['highlight_partial'],
    );

    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function configurationFormValidate(array $form, array &$values, array &$form_state) {
    // Only keep the fields that were actually checked. Fall back to an empty
    // list if the element was not submitted at all.
    if (isset($values['exclude_fields']) && is_array($values['exclude_fields'])) {
      $values['exclude_fields'] = array_filter($values['exclude_fields']);
    }
    else {
      $values['exclude_fields'] = array();
    }
  }

  /**
   * {@inheritdoc}
   */
  public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
    if (empty($response['results']) || !($keys = $this->getKeywords($query))) {
      return;
    }

    $fulltext_fields = $this->index->getFulltextFields();
    if (!empty($this->options['exclude_fields'])) {
      $fulltext_fields = drupal_map_assoc($fulltext_fields);
      foreach ($this->options['exclude_fields'] as $field) {
        unset($fulltext_fields[$field]);
      }
    }

    // Read the options defensively: they are only guaranteed to be set once
    // the configuration form has been saved. A missing "highlight" option
    // behaves like "server" (highlight, but do not load items).
    $create_excerpt = !empty($this->options['excerpt']);
    $highlight_mode = isset($this->options['highlight']) ? $this->options['highlight'] : 'server';

    foreach ($response['results'] as $id => &$result) {
      if ($create_excerpt) {
        // Collect the text of all fulltext fields into one flat list.
        $text = array();
        $fields = $this->getFulltextFields($response['results'], $id, $fulltext_fields);
        foreach ($fields as $data) {
          if (is_array($data)) {
            $text = array_merge($text, $data);
          }
          else {
            $text[] = $data;
          }
        }
        $result['excerpt'] = $this->createExcerpt($this->flattenArrayValues($text), $keys);
      }

      if ($highlight_mode != 'never') {
        $fields = $this->getFulltextFields($response['results'], $id, $fulltext_fields, $highlight_mode == 'always');
        foreach ($fields as $field => $data) {
          if (is_array($data)) {
            // Build the value first so that an empty multi-valued field still
            // ends up with a "#value" key.
            $value = array();
            foreach ($data as $i => $text) {
              $value[$i] = $this->highlightField($text, $keys);
            }
          }
          else {
            $value = $this->highlightField($data, $keys);
          }
          // The highlighted value contains markup, so it is flagged to skip
          // the default sanitization.
          $result['fields'][$field] = array(
            '#sanitize_callback' => FALSE,
            '#value' => $value,
          );
        }
      }
    }
    // Break the reference left behind by the by-reference loop.
    unset($result);
  }

  /**
   * Retrieves the fulltext data of a result.
   *
   * Field values already present in the result are used as-is. If $load is
   * TRUE (or the result already carries a loaded item), missing fields are
   * extracted from the item, loading the items of all results if necessary.
   *
   * @param array $results
   *   All results returned in the search, by reference. Loaded items are
   *   stored in the "entity" key of the individual results.
   * @param int|string $i
   *   The index in the results array of the result whose data should be
   *   returned.
   * @param array $fulltext_fields
   *   The fulltext fields from which the excerpt should be created.
   * @param bool $load
   *   TRUE if the item should be loaded if necessary, FALSE if only fields
   *   already returned in the results should be used.
   *
   * @return array
   *   An array containing fulltext field names mapped to the text data
   *   contained in them for the given result.
   */
  protected function getFulltextFields(array &$results, $i, array $fulltext_fields, $load = TRUE) {
    global $language;

    $data = array();
    $result = &$results[$i];
    // Act as if $load is TRUE if we have a loaded item.
    $load = $load || !empty($result['entity']);
    $result += array('fields' => array());

    // We only need detailed fields data if $load is TRUE.
    $fields = $load ? $this->index->getFields() : array();
    $needs_extraction = array();
    $returned_fields = search_api_get_sanitized_field_values(array_intersect_key($result['fields'], array_flip($fulltext_fields)));
    foreach ($fulltext_fields as $field) {
      if (array_key_exists($field, $returned_fields)) {
        $data[$field] = $returned_fields[$field];
      }
      // Skip fields the index has no definition for instead of passing NULL
      // on to the extraction.
      elseif ($load && isset($fields[$field])) {
        $needs_extraction[$field] = $fields[$field];
      }
    }

    if (!$needs_extraction) {
      return $data;
    }

    if (empty($result['entity'])) {
      // Load the items of all results at once and remember them.
      $items = $this->index->loadItems(array_keys($results));
      foreach ($items as $id => $item) {
        $results[$id]['entity'] = $item;
      }
    }
    // If we still don't have a loaded item, we should stop trying.
    if (empty($result['entity'])) {
      return $data;
    }

    $wrapper = $this->index->entityWrapper($result['entity'], FALSE);
    $wrapper->language($language->language);
    $extracted = search_api_extract_fields($wrapper, $needs_extraction, array('sanitize' => TRUE));
    foreach ($extracted as $field => $info) {
      if (isset($info['value'])) {
        $data[$field] = $info['value'];
      }
    }

    return $data;
  }

  /**
   * Extracts the positive keywords used in a search query.
   *
   * @param SearchApiQuery $query
   *   The query from which to extract the keywords.
   *
   * @return array
   *   An array of all unique positive keywords used in the query, keyed by
   *   the keyword itself.
   */
  protected function getKeywords(SearchApiQuery $query) {
    $keys = $query->getKeys();
    if (is_array($keys)) {
      return $keys ? $this->flattenKeysArray($keys) : array();
    }

    // Compare against the empty string explicitly, so a search for "0" is
    // not mistaken for "no keywords".
    $keys = (string) $keys;
    if ($keys === '') {
      return array();
    }

    $words = preg_split(self::$split, $keys, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === FALSE) {
      // The keys could not be split (e.g., invalid UTF-8).
      return array();
    }

    // Remove quotes from keywords. Keying the array by keyword also makes
    // sure there are no duplicates.
    $keywords = array();
    foreach ($words as $word) {
      $word = trim($word, "'\"");
      if ($word !== '') {
        $keywords[$word] = $word;
      }
    }

    return $keywords;
  }

  /**
   * Extracts the positive keywords from a keys array.
   *
   * Nested keys arrays are processed recursively. Negated (sub-)arrays do not
   * contribute any keywords.
   *
   * @param array $keys
   *   A search keys array, as specified by SearchApiQueryInterface::getKeys().
   *
   * @return array
   *   An array of all unique positive keywords contained in the keys, keyed
   *   by the keyword itself.
   */
  protected function flattenKeysArray(array $keys) {
    if (!empty($keys['#negation'])) {
      return array();
    }

    $keywords = array();
    foreach ($keys as $i => $key) {
      // Entries whose key starts with "#" are settings, not keywords.
      if (!element_child($i)) {
        continue;
      }
      if (is_array($key)) {
        $keywords += $this->flattenKeysArray($key);
      }
      // An empty keyword would later turn into an empty regular expression
      // alternative, which matches at every word boundary.
      elseif (is_scalar($key) && (string) $key !== '') {
        $keywords[$key] = $key;
      }
    }

    return $keywords;
  }

  /**
   * Returns snippets from a piece of text, with certain keywords highlighted.
   *
   * Largely copied from search_excerpt().
   *
   * All positions in this method are byte offsets into the prepared text,
   * which is what both preg_match() (with PREG_OFFSET_CAPTURE) and the
   * byte-wise string functions used below work with.
   *
   * @param string $text
   *   The text to extract fragments from.
   * @param array $keys
   *   Search keywords entered by the user.
   *
   * @return string|null
   *   A string containing HTML for the excerpt, or NULL if none could be
   *   created.
   */
  protected function createExcerpt($text, array $keys) {
    // Prepare text by stripping HTML tags and decoding HTML entities.
    $text = strip_tags(str_replace(array('<', '>'), array(' <', '> '), $text));
    $text = decode_entities($text);
    $text = preg_replace('/\s+/', ' ', $text);
    $text = trim($text, ' ');
    $text_length = strlen($text);
    if ($text_length == 0) {
      return NULL;
    }

    // Ignore keywords that cannot be searched for.
    $remaining_keys = array();
    foreach ($keys as $key) {
      if (is_scalar($key) && (string) $key !== '') {
        $remaining_keys[] = (string) $key;
      }
    }

    // Use the same matching rules as highlightField(): a case-insensitive
    // Unicode match which, unless partial matches are enabled, has to start
    // and end on a word boundary. Since the boundary assertions need a
    // character to look at, whole-word matching runs on a copy of the text
    // padded with one space on each side; $shift converts match offsets in
    // that copy back to offsets in $text.
    $partial = !empty($this->options['highlight_partial']);
    $boundary = $partial ? '' : self::$boundary;
    $subject = $partial ? $text : ' ' . $text . ' ';
    $shift = $partial ? 0 : 1;

    // Try to reach the requested excerpt length with about two fragments (each
    // with a keyword and some context).
    $ranges = array();
    $length = 0;
    $look_start = array();

    // Get the set excerpt length from the configuration. If the length is too
    // small, only use one fragment.
    $excerpt_length = isset($this->options['excerpt_length']) ? (int) $this->options['excerpt_length'] : 0;
    $context_length = (int) round($excerpt_length / 4) - 3;
    if ($context_length < 32) {
      $context_length = (int) round($excerpt_length / 2) - 1;
    }
    $context_length = max(0, $context_length);

    while ($length < $excerpt_length && !empty($remaining_keys)) {
      $found_keys = array();
      foreach ($remaining_keys as $key) {
        if ($length >= $excerpt_length) {
          break;
        }

        // Remember where we last found $key, in case we are coming through a
        // second time. These offsets refer to $subject.
        if (!isset($look_start[$key])) {
          $look_start[$key] = 0;
        }

        // See if we can find $key after where we found it the last time.
        $regex = '/' . $boundary . preg_quote($key, '/') . $boundary . '/iu';
        $matches = array();
        if (!preg_match($regex, $subject, $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
          continue;
        }
        $match_offset = $matches[0][1];
        $found_position = max(0, $match_offset - $shift);
        // Continue behind the match next time. This always is a character
        // boundary, which preg_match() requires for offsets in UTF-8 mode.
        $look_start[$key] = $match_offset + strlen($matches[0][0]);

        // Keep track of which keys we found this time, in case we need to
        // pass through again to find more text.
        $found_keys[] = $key;

        // Locate a space before and after this match, leaving some context on
        // each end.
        if ($found_position > $context_length) {
          $before = strpos($text, ' ', $found_position - $context_length);
          if ($before !== FALSE) {
            ++$before;
          }
        }
        else {
          $before = 0;
        }
        if ($before === FALSE || $before > $found_position) {
          continue;
        }

        if ($text_length > $found_position + $context_length) {
          $after = strrpos(substr($text, 0, $found_position + $context_length), ' ', $found_position);
        }
        else {
          $after = $text_length;
        }
        if ($after === FALSE || $after <= $found_position || $before >= $after) {
          continue;
        }

        // Save this range. If there already is a range with the same start,
        // keep the longer one, so no earlier match gets cut off.
        if (!isset($ranges[$before]) || $ranges[$before] < $after) {
          $ranges[$before] = $after;
        }
        $length += $after - $before;
      }
      // Next time through this loop, only look for keys we found this time,
      // if any.
      $remaining_keys = $found_keys;
    }

    if (!$ranges) {
      // We didn't find any keyword matches, return NULL.
      return NULL;
    }

    // Sort the text ranges by starting position.
    ksort($ranges);

    // Collapse overlapping text ranges into one. The sorting makes it O(n).
    $newranges = array();
    $from1 = $to1 = NULL;
    foreach ($ranges as $from2 => $to2) {
      if ($from1 === NULL) {
        // This is the first time through this loop: initialize.
        $from1 = $from2;
        $to1 = $to2;
        continue;
      }
      if ($from2 <= $to1) {
        // The ranges overlap: combine them.
        $to1 = max($to1, $to2);
      }
      else {
        // The ranges do not overlap: save the working range and start a new
        // one.
        $newranges[$from1] = $to1;
        $from1 = $from2;
        $to1 = $to2;
      }
    }
    // Save the remaining working range.
    $newranges[$from1] = $to1;

    // Fetch text within the combined ranges we found.
    $out = array();
    foreach ($newranges as $from => $to) {
      $out[] = substr($text, $from, $to - $from);
    }

    // Let translators have the ... separator text as one chunk.
    $dots = explode('!excerpt', t('... !excerpt ... !excerpt ...'));
    // Guard against translations that dropped a placeholder.
    $dots += array('', '', '');
    // No leading separator if the excerpt starts at the beginning of the text.
    $text = (isset($newranges[0]) ? '' : $dots[0]) . implode($dots[1], $out) . $dots[2];
    $text = check_plain($text);

    // Since we stripped the tags at the beginning, highlighting doesn't need to
    // handle HTML anymore.
    // NOTE(review): the text is escaped before it is highlighted, so a keyword
    // like "amp" or "quot" could presumably match inside an entity created by
    // check_plain() - confirm whether this needs handling.
    return $this->highlightField($text, $keys, FALSE);
  }

  /**
   * Marks occurrences of the search keywords in a text field.
   *
   * @param string $text
   *   The text of the field. Arrays are flattened into a single string first.
   * @param array $keys
   *   Search keywords entered by the user.
   * @param bool $html
   *   Whether the text can contain HTML tags or not. In the former case, text
   *   inside tags (i.e., tag names and attributes) won't be highlighted.
   *
   * @return string
   *   The field's text with all occurrences of search keywords highlighted.
   *   If the text cannot be processed (e.g., because it is not valid UTF-8),
   *   it is returned unchanged.
   */
  protected function highlightField($text, array $keys, $html = TRUE) {
    if (is_array($text)) {
      $text = $this->flattenArrayValues($text);
    }
    $text = (string) $text;

    if ($html) {
      // Split the text into alternating plain text (even indexes) and runs of
      // tags (odd indexes). Attribute values may be double- or single-quoted.
      // The possessive quantifiers keep the pattern from backtracking
      // excessively on an unclosed "<".
      $texts = preg_split('#((?:</?[[:alpha:]](?:[^>"\']++|"[^"]*+"|\'[^\']*+\')*+>)+)#i', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
      if ($texts === FALSE) {
        // Better no highlighting than highlighting inside of tags.
        return $text;
      }
      for ($i = 0, $count = count($texts); $i < $count; $i += 2) {
        $texts[$i] = $this->highlightField($texts[$i], $keys, FALSE);
      }
      return implode('', $texts);
    }

    if ($text === '') {
      return '';
    }

    // Empty keywords have to be skipped: an empty alternative would match at
    // every single word boundary.
    $patterns = array();
    foreach ($keys as $key) {
      if (is_scalar($key) && (string) $key !== '') {
        $patterns[] = preg_quote((string) $key, '/');
      }
    }
    if (!$patterns) {
      return $text;
    }

    // If "Highlight partial matches" is disabled, we only want to highlight
    // matches that are complete words. Otherwise, we want all of them.
    $boundary = empty($this->options['highlight_partial']) ? self::$boundary : '';
    $regex = '/' . $boundary . '(?:' . implode('|', $patterns) . ')' . $boundary . '/iu';

    $prefix = isset($this->options['prefix']) ? $this->options['prefix'] : '';
    $suffix = isset($this->options['suffix']) ? $this->options['suffix'] : '';
    // A callback is used so that "$" and "\" in the configured prefix and
    // suffix are inserted literally instead of being treated as references.
    $wrap = function (array $match) use ($prefix, $suffix) {
      return $prefix . $match[0] . $suffix;
    };

    // The boundary assertions need a character to look at, so pad the text
    // with a space on each side and remove the padding again afterwards.
    $highlighted = preg_replace_callback($regex, $wrap, ' ' . $text . ' ');
    if ($highlighted === NULL) {
      return $text;
    }

    return substr($highlighted, 1, -1);
  }

  /**
   * Flattens a (possibly multidimensional) array into a string.
   *
   * @param array $array
   *   The array to flatten.
   * @param string $glue
   *   (optional) The separator to insert between individual array items.
   *
   * @return string
   *   The glued string.
   */
  protected function flattenArrayValues(array $array, $glue = " \n\n ") {
    $ret = array();
    foreach ($array as $item) {
      if (is_array($item)) {
        // Nested arrays are glued the same way before being added.
        $ret[] = $this->flattenArrayValues($item, $glue);
      }
      else {
        $ret[] = $item;
      }
    }

    return implode($glue, $ret);
  }

}