options += array(
'title' => FALSE,
'alt' => TRUE,
'tags' => "h1 = 5\n" .
"h2 = 3\n" .
"h3 = 2\n" .
"strong = 2\n" .
"b = 2\n" .
"em = 1.5\n" .
'u = 1.5',
);
$this->tags = drupal_parse_info_format($this->options['tags']);
// Specifying empty tags doesn't make sense.
unset($this->tags['br'], $this->tags['hr']);
}
/**
 * Builds the configuration form for this processor.
 *
 * Adds two checkboxes (index "title" attributes, index "alt" attributes) and
 * a textarea for per-tag boost values to the form provided by the parent.
 *
 * @return array
 *   The parent's form array with the 'title', 'alt' and 'tags' elements
 *   added. Because array union (+=) is used, elements the parent already
 *   defines under those keys are kept as they are.
 */
public function configurationForm() {
  $form = parent::configurationForm();

  // The description links to the documentation of the INI-like format that
  // drupal_parse_info_format() understands; "@link" is replaced by t().
  $tags_description = t('Specify special boost values for certain HTML elements, in <a href="@link">INI file format</a>. ' .
    'The boost values of nested elements are multiplied, elements not mentioned will have the default boost value of 1. ' .
    'Assign a boost of 0 to ignore the text content of that HTML element.',
    array('@link' => url('http://api.drupal.org/api/function/drupal_parse_info_format/7')));

  $form += array(
    'title' => array(
      '#type' => 'checkbox',
      '#title' => t('Index title attribute'),
      '#description' => t('If set, the contents of title attributes will be indexed.'),
      '#default_value' => $this->options['title'],
    ),
    'alt' => array(
      '#type' => 'checkbox',
      '#title' => t('Index alt attribute'),
      '#description' => t('If set, the alternative text of images will be indexed.'),
      '#default_value' => $this->options['alt'],
    ),
    'tags' => array(
      '#type' => 'textarea',
      '#title' => t('Tag boosts'),
      '#description' => $tags_description,
      '#default_value' => $this->options['tags'],
    ),
  );

  return $form;
}
/**
 * Validates the submitted "Tag boosts" setting.
 *
 * Runs the parent's validation first, then parses the 'tags' value with
 * drupal_parse_info_format() and reports every entry whose boost is an
 * array, is not numeric, or is negative.
 *
 * @param array $form
 *   The configuration form; $form['tags'] is the element errors are set on.
 * @param array $values
 *   The submitted values for this form; only $values['tags'] is read here.
 * @param array $form_state
 *   The form state, passed on to the parent implementation.
 */
public function configurationFormValidate(array $form, array &$values, array &$form_state) {
  parent::configurationFormValidate($form, $values, $form_state);

  // Nothing to check when no boosts were entered.
  if (empty($values['tags'])) {
    return;
  }

  $tags = drupal_parse_info_format($values['tags']);
  $errors = array();
  foreach ($tags as $key => $value) {
    // The angle brackets are written as entities: the message is printed as
    // HTML, so a literal "<@tag>" would be emitted as a real element (for
    // example "<h1>") instead of being displayed.
    if (is_array($value)) {
      $errors[] = t("Boost value for tag &lt;@tag&gt; can't be an array.", array('@tag' => $key));
    }
    elseif (!is_numeric($value)) {
      $errors[] = t("Boost value for tag &lt;@tag&gt; must be numeric.", array('@tag' => $key));
    }
    elseif ($value < 0) {
      $errors[] = t('Boost value for tag &lt;@tag&gt; must be non-negative.', array('@tag' => $key));
    }
  }

  if ($errors) {
    // Use an HTML line break so that each message appears on its own line;
    // a bare newline collapses to a single space when rendered.
    form_error($form['tags'], implode("<br />\n", $errors));
  }
}
/**
 * Removes HTML markup from a field value.
 *
 * Depending on the options, the text of "title" attributes and of image
 * "alt" attributes is kept as regular text. When tag boosts are configured,
 * the value is split into scored tokens by parseText(); otherwise it becomes
 * a plain string with all tags removed.
 *
 * @param mixed $value
 *   The HTML string to process, passed by reference. On return it holds
 *   either the token list produced by parseText() or the stripped,
 *   entity-decoded and whitespace-normalized string.
 */
protected function processFieldValue(&$value) {
  // Let removed tags still delimit words.
  $text = str_replace(array('<', '>'), array(' <', '> '), $value);

  if ($this->options['title']) {
    // Rebuild the tag without its title attribute ($1 and $5) and place the
    // attribute's text ($3 or $4, depending on the quote style) behind it.
    $text = preg_replace('/(<[-a-z_]+[^>]+)\btitle\s*=\s*("([^"]+)"|\'([^\']+)\')([^>]*>)/i', '$1 $5 $3$4 ', $text);
  }

  if ($this->options['alt']) {
    // Replace each <img> tag carrying an alt attribute with the alternative
    // text. The pattern must start at "<img"; without that prefix it would
    // only match after a literal "]" and never see an image tag.
    $text = preg_replace('/<img\b[^>]+\balt\s*=\s*("([^"]+)"|\'([^\']+)\')[^>]*>/i', ' $2$3 ', $text);
  }

  if ($this->tags) {
    // Keep only the tags that have a boost configured, then tokenize.
    $text = strip_tags($text, '<' . implode('><', array_keys($this->tags)) . '>');
    $value = $this->parseText($text);
  }
  else {
    // Decode with the same flags and charset that parseText() uses, so both
    // branches treat quotes and non-ASCII entities identically.
    $value = html_entity_decode(strip_tags($text), ENT_QUOTES, 'UTF-8');
    // Remove any multiple or leading/trailing spaces we might have introduced.
    $value = preg_replace('/\s\s+/', ' ', trim($value));
  }
}
/**
 * Splits HTML text into tokens scored by the boosts of the enclosing tags.
 *
 * Text preceding each tag is emitted as a token with the current boost. An
 * opening tag triggers a recursive call with the current boost multiplied by
 * that tag's configured boost (1 when none is configured); the recursion
 * ends at the matching closing tag or when the text is exhausted.
 *
 * @param string $text
 *   The text to parse, passed by reference. Parsed portions are removed from
 *   it, so after a recursive call the caller continues with the remainder.
 * @param string|null $active_tag
 *   (optional) Name of the element whose content is being parsed. Parsing
 *   returns as soon as a closing tag with this name is encountered.
 * @param int|float $boost
 *   (optional) The boost applying to text at this level. Text is skipped
 *   while the boost is 0.
 *
 * @return array
 *   A list of arrays, each with the keys 'value' (entity-decoded text with
 *   whitespace runs collapsed and trimmed) and 'score' (the boost in effect
 *   for that text).
 */
protected function parseText(&$text, $active_tag = NULL, $boost = 1) {
  $ret = array();
  while (($pos = strpos($text, '<')) !== FALSE) {
    if ($boost && $pos > 0) {
      $token = html_entity_decode(substr($text, 0, $pos), ENT_QUOTES, 'UTF-8');
      // Remove any multiple or leading/trailing spaces we might have introduced.
      $token = preg_replace('/\s\s+/', ' ', trim($token));
      $ret[] = array(
        'value' => $token,
        'score' => $boost,
      );
    }

    // The cast guards against substr() returning FALSE instead of '' on
    // older PHP versions when "<" is the last character.
    $text = (string) substr($text, $pos + 1);
    if (!preg_match('#^(/?)([:_a-zA-Z][-:_a-zA-Z0-9.]*)#', $text, $m)) {
      // A "<" that doesn't start a tag; keep scanning the remaining text.
      continue;
    }

    $end = strpos($text, '>');
    if ($end === FALSE) {
      // Unterminated tag: the rest is tag residue, not content. Without this
      // check, FALSE + 1 would cut off a single character only and the
      // remainder of the tag would be emitted as text.
      $text = '';
      break;
    }
    $text = (string) substr($text, $end + 1);

    if ($m[1]) {
      // Closing tag.
      if ($active_tag && $m[2] == $active_tag) {
        return $ret;
      }
    }
    else {
      // Opening tag => recursive call.
      $inner_boost = $boost * (isset($this->tags[$m[2]]) ? $this->tags[$m[2]] : 1);
      $ret = array_merge($ret, $this->parseText($text, $m[2], $inner_boost));
    }
  }

  // Compare against '' explicitly: a plain truthiness check would silently
  // drop trailing text consisting of just "0".
  if ((string) $text !== '') {
    // As inside the loop, text with a boost of 0 is ignored.
    if ($boost) {
      $token = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
      // Remove any multiple or leading/trailing spaces we might have introduced.
      $token = preg_replace('/\s\s+/', ' ', trim($token));
      $ret[] = array(
        'value' => $token,
        'score' => $boost,
      );
    }
    // Everything has been consumed; callers must not parse it again.
    $text = '';
  }
  return $ret;
}
}