<?php

/**
 * @file
 * processor.inc: Contains SearchApiProcessorInterface and
 * SearchApiAbstractProcessor.
 */

  /**
   * Interface representing a Search API pre- and/or post-processor.
   *
   * While processors are enabled or disabled for both pre- and postprocessing
   * at once, many processors will only need to run in one of those two phases.
   * Then, the other method(s) should simply be left blank. A processor should
   * make it clear in its description or documentation when it will run and
   * what effect it will have.
   *
   * Usually, processors preprocessing indexed items will likewise preprocess
   * search queries, so these two methods should mostly be implemented either
   * both or neither.
   */
  interface SearchApiProcessorInterface {

    /**
     * Constructs a processor.
     *
     * @param SearchApiIndex $index
     *   The index for which processing is done.
     * @param array $options
     *   The processor options set for this index.
     */
    public function __construct(SearchApiIndex $index, array $options = array());

    /**
     * Checks whether this processor is applicable for a certain index.
     *
     * This can be used for hiding the processor on the index's "Workflow" tab.
     * To avoid confusion, you should only use criteria that are immutable,
     * such as the index's item type. Also, since this is only used for UI
     * purposes, you should not completely rely on this to ensure certain index
     * configurations and at least throw an exception with a descriptive error
     * message if this is violated at runtime.
     *
     * @param SearchApiIndex $index
     *   The index to check for.
     *
     * @return bool
     *   TRUE if the processor can run on the given index; FALSE otherwise.
     */
    public function supportsIndex(SearchApiIndex $index);

    /**
     * Displays a form for configuring this processor.
     *
     * Since forcing users to specify options for disabled processors makes no
     * sense, none of the form elements should have the '#required' attribute
     * set.
     *
     * @return array|false
     *   A form array for configuring this processor, or FALSE if no
     *   configuration is possible.
     */
    public function configurationForm();

    /**
     * Validation callback for the form returned by configurationForm().
     *
     * @param array $form
     *   The form returned by configurationForm().
     * @param array $values
     *   The part of the $form_state['values'] array corresponding to this
     *   form. Passed by reference, so implementations may normalize it.
     * @param array $form_state
     *   The complete form state.
     */
    public function configurationFormValidate(array $form, array &$values, array &$form_state);

    /**
     * Submit callback for the form returned by configurationForm().
     *
     * This method should both return the new options and set them internally.
     *
     * @param array $form
     *   The form returned by configurationForm().
     * @param array $values
     *   The part of the $form_state['values'] array corresponding to this
     *   form.
     * @param array $form_state
     *   The complete form state.
     *
     * @return array
     *   The new options array for this callback.
     */
    public function configurationFormSubmit(array $form, array &$values, array &$form_state);

    /**
     * Preprocesses data items for indexing.
     *
     * Typically, a preprocessor will execute its preprocessing (e.g. stemming,
     * n-grams, word splitting, stripping stop words, etc.) only on the items'
     * search_api_fulltext fields, if set. Other fields should usually be left
     * untouched.
     *
     * @param array $items
     *   An array of items to be preprocessed for indexing, formatted as
     *   specified by SearchApiServiceInterface::indexItems(). Passed by
     *   reference and modified in place.
     */
    public function preprocessIndexItems(array &$items);

    /**
     * Preprocesses a search query.
     *
     * The same applies as when preprocessing indexed items: typically, only
     * the fulltext search keys should be processed, queries on specific fields
     * should usually not be altered.
     *
     * @param SearchApiQuery $query
     *   The object representing the query to be executed.
     */
    public function preprocessSearchQuery(SearchApiQuery $query);

    /**
     * Postprocesses search results before display.
     *
     * If a class is used for both pre- and post-processing a search query, the
     * same object will be used for both calls (so preserving some data or
     * state locally is possible).
     *
     * @param array $response
     *   An array containing the search results. See the return value of
     *   SearchApiQueryInterface::execute() for the detailed format.
     * @param SearchApiQuery $query
     *   The object representing the executed query.
     */
    public function postprocessSearchResults(array &$response, SearchApiQuery $query);

  }
  /**
   * Abstract processor implementation that provides an easy framework for only
   * processing specific fields.
   *
   * Simple processors can just override process(), while others might want to
   * override the other process*() methods, and test*() (for restricting
   * processing to something other than all fulltext data).
   */
  abstract class SearchApiAbstractProcessor implements SearchApiProcessorInterface {

    /**
     * The index for which processing is done.
     *
     * @var SearchApiIndex
     */
    protected $index;

    /**
     * The processor options set for this index.
     *
     * @var array
     */
    protected $options;

    /**
     * Constructor, saving its arguments into properties.
     *
     * @param SearchApiIndex $index
     *   The index for which processing is done.
     * @param array $options
     *   The processor options set for this index.
     */
    public function __construct(SearchApiIndex $index, array $options = array()) {
      $this->index = $index;
      $this->options = $options;
    }

    /**
     * Reports every index as supported.
     *
     * @param SearchApiIndex $index
     *   The index to check for. Not inspected by this implementation.
     *
     * @return bool
     *   Always TRUE.
     */
    public function supportsIndex(SearchApiIndex $index) {
      return TRUE;
    }

    /**
     * Builds a form with one checkbox per index field.
     *
     * If no 'fields' option has been saved yet, the fields accepted by
     * testField() are pre-selected; otherwise the saved selection is used.
     *
     * @return array
     *   A form array containing the 'fields' checkboxes element.
     */
    public function configurationForm() {
      $form = array();
      $form['#attached']['css'][] = drupal_get_path('module', 'search_api') . '/search_api.admin.css';

      $fields = $this->index->getFields();
      $has_saved_selection = isset($this->options['fields']);
      $field_options = array();
      $default_fields = array();
      if ($has_saved_selection) {
        $default_fields = drupal_map_assoc(array_keys($this->options['fields']));
      }
      foreach ($fields as $name => $field) {
        // Field names are plain-text labels; escape them before they are
        // rendered as checkbox titles.
        $field_options[$name] = check_plain($field['name']);
        if (!empty($default_fields[$name]) || (!$has_saved_selection && $this->testField($name, $field))) {
          $default_fields[$name] = $name;
        }
      }

      $form['fields'] = array(
        '#type' => 'checkboxes',
        '#title' => t('Fields to run on'),
        '#options' => $field_options,
        '#default_value' => $default_fields,
        '#attributes' => array('class' => array('search-api-checkboxes-list')),
      );

      return $form;
    }

    /**
     * Normalizes the submitted 'fields' value into a "name => TRUE" map.
     *
     * @param array $form
     *   The form returned by configurationForm().
     * @param array $values
     *   The submitted values for this form; $values['fields'] is rewritten in
     *   place.
     * @param array $form_state
     *   The complete form state.
     */
    public function configurationFormValidate(array $form, array &$values, array &$form_state) {
      // A subclass may have replaced the form; without a 'fields' element
      // there is nothing to normalize.
      if (!isset($values['fields']) || !is_array($values['fields'])) {
        return;
      }
      // Unchecked checkboxes are submitted with a falsy value; drop them.
      $fields = array_filter($values['fields']);
      if ($fields) {
        $fields = array_fill_keys($fields, TRUE);
      }
      $values['fields'] = $fields;
    }

    /**
     * Stores the submitted values as this processor's options.
     *
     * @param array $form
     *   The form returned by configurationForm().
     * @param array $values
     *   The submitted values for this form.
     * @param array $form_state
     *   The complete form state.
     *
     * @return array
     *   The new options, identical to $values.
     */
    public function configurationFormSubmit(array $form, array &$values, array &$form_state) {
      $this->options = $values;
      return $values;
    }

    /**
     * Calls processField() for all appropriate fields.
     *
     * @param array $items
     *   The items to preprocess; each item is iterated as "name => field",
     *   where the field's 'value' and 'type' entries may be modified in place.
     */
    public function preprocessIndexItems(array &$items) {
      foreach ($items as &$item) {
        foreach ($item as $name => &$field) {
          if ($this->testField($name, $field)) {
            $this->processField($field['value'], $field['type']);
          }
        }
        // Break the reference so the next item's loop cannot write through it.
        unset($field);
      }
      unset($item);
    }

    /**
     * Calls processKeys() for the keys and processFilters() for the filters.
     *
     * @param SearchApiQuery $query
     *   The query whose keys and filters are processed in place.
     */
    public function preprocessSearchQuery(SearchApiQuery $query) {
      $keys = &$query->getKeys();
      $this->processKeys($keys);
      $filter = $query->getFilter();
      $filters = &$filter->getFilters();
      $this->processFilters($filters);
    }

    /**
     * Does nothing.
     *
     * @param array $response
     *   The search results; left untouched.
     * @param SearchApiQuery $query
     *   The executed query; not used.
     */
    public function postprocessSearchResults(array &$response, SearchApiQuery $query) {
      return;
    }

    /**
     * Method for preprocessing field data.
     *
     * Calls processFieldValue() either for the whole text, or each token,
     * depending on the type. Also takes care of extracting list values and of
     * fusing returned tokens back into a one-dimensional array.
     *
     * @param mixed $value
     *   The field value, modified in place. Unset values and the empty string
     *   are skipped.
     * @param string $type
     *   The field's type, modified in place if the value got tokenized.
     */
    protected function processField(&$value, &$type) {
      if (!isset($value) || $value === '') {
        return;
      }

      if (substr($type, 0, 5) === 'list<') {
        $inner_type = $t = substr($type, 5, -1);
        foreach ($value as &$v) {
          $t1 = $inner_type;
          $this->processField($v, $t1);
          // If one value got tokenized, all others have to follow.
          if ($t1 !== $inner_type) {
            $t = $t1;
          }
        }
        unset($v);
        if ($t === 'tokens') {
          foreach ($value as $i => &$v) {
            // Drop empty values, but keep numeric ones such as "0", in line
            // with how normalizeTokens() and implodeTokens() treat them.
            if (!$v && !is_numeric($v)) {
              unset($value[$i]);
              continue;
            }
            // Wrap values that were not tokenized into a single-token array.
            if (!is_array($v)) {
              $v = array(array('value' => $v, 'score' => 1));
            }
          }
          unset($v);
        }
        $type = "list<$t>";
        return;
      }

      if ($type === 'tokens') {
        foreach ($value as &$token) {
          $this->processFieldValue($token['value']);
        }
        unset($token);
      }
      else {
        $this->processFieldValue($value);
      }

      if (is_array($value)) {
        // Don't tokenize non-fulltext content!
        if (in_array($type, array('text', 'tokens'), TRUE)) {
          $type = 'tokens';
          $value = $this->normalizeTokens($value);
        }
        else {
          $value = $this->implodeTokens($value);
        }
      }
    }

    /**
     * Internal helper function for normalizing tokens.
     *
     * Flattens nested token arrays into a one-dimensional list, multiplying
     * the score of nested tokens by their parent's score and skipping tokens
     * with an empty, non-numeric value.
     *
     * @param array $tokens
     *   The (possibly nested) tokens to normalize.
     * @param float|int $score
     *   The score of the parent token, applied as a factor.
     *
     * @return array
     *   A flat array of tokens, each with 'value' and 'score' set.
     */
    protected function normalizeTokens($tokens, $score = 1) {
      $ret = array();
      foreach ($tokens as $token) {
        if (empty($token['value']) && !is_numeric($token['value'])) {
          // Filter out empty tokens.
          continue;
        }
        if (!isset($token['score'])) {
          $token['score'] = $score;
        }
        else {
          $token['score'] *= $score;
        }
        if (is_array($token['value'])) {
          foreach ($this->normalizeTokens($token['value'], $token['score']) as $t) {
            $ret[] = $t;
          }
        }
        else {
          $ret[] = $token;
        }
      }
      return $ret;
    }

    /**
     * Internal helper function for imploding tokens into a single string.
     *
     * @param array $tokens
     *   The tokens array to implode.
     *
     * @return string
     *   The text data from the tokens concatenated into a single string,
     *   separated by single spaces.
     */
    protected function implodeTokens(array $tokens) {
      $ret = array();
      foreach ($tokens as $token) {
        if (empty($token['value']) && !is_numeric($token['value'])) {
          // Filter out empty tokens.
          continue;
        }
        if (is_array($token['value'])) {
          $ret[] = $this->implodeTokens($token['value']);
        }
        else {
          $ret[] = $token['value'];
        }
      }
      return implode(' ', $ret);
    }

    /**
     * Method for preprocessing search keys.
     *
     * Recurses into nested key arrays, calls processKey() on every single
     * key, and removes keys that end up empty (numeric keys are kept).
     *
     * @param mixed $keys
     *   Either a single key or a (nested) keys array, modified in place.
     */
    protected function processKeys(&$keys) {
      if (is_array($keys)) {
        foreach ($keys as $key => &$v) {
          // Entries whose key is not an element child (e.g. '#conjunction')
          // are settings rather than search keys and are skipped.
          if (element_child($key)) {
            $this->processKeys($v);
            if (!$v && !is_numeric($v)) {
              unset($keys[$key]);
            }
          }
        }
        unset($v);
      }
      else {
        $this->processKey($keys);
      }
    }

    /**
     * Method for preprocessing query filters.
     *
     * @param array $filters
     *   The filters to process, modified in place. Array entries are treated
     *   as conditions with the field name at index 0 and the value at index 1;
     *   all other entries are treated as nested filter objects.
     */
    protected function processFilters(array &$filters) {
      $fields = isset($this->index->options['fields']) ? $this->index->options['fields'] : array();
      foreach ($filters as $key => &$f) {
        if (is_array($f)) {
          if (isset($fields[$f[0]]) && $this->testField($f[0], $fields[$f[0]])) {
            // We want to allow processors also to easily remove complete
            // filters. However, we can't use empty() or the like, as that
            // would sort out filters for 0 or NULL. So we specifically check
            // only for the empty string, and we also make sure the filter
            // value was actually changed by storing whether it was empty
            // before.
            $empty_string = $f[1] === '';
            $this->processFilterValue($f[1]);
            if ($f[1] === '' && !$empty_string) {
              unset($filters[$key]);
            }
          }
        }
        else {
          $child_filters = &$f->getFilters();
          $this->processFilters($child_filters);
        }
      }
      unset($f);
    }

    /**
     * Determines whether a field should be processed.
     *
     * @param string $name
     *   The field's machine name.
     * @param array $field
     *   The field's information.
     *
     * @return bool
     *   TRUE, iff the field should be processed. If no fields are configured
     *   in the options, this falls back to testType() on the field's type.
     */
    protected function testField($name, array $field) {
      if (empty($this->options['fields'])) {
        return $this->testType($field['type']);
      }
      return !empty($this->options['fields'][$name]);
    }

    /**
     * Determines whether fields of a certain type should be processed.
     *
     * @param string $type
     *   The type to check.
     *
     * @return bool
     *   TRUE, iff the type should be processed. This implementation returns
     *   the result of search_api_is_text_type() for 'text' and 'tokens'.
     */
    protected function testType($type) {
      return search_api_is_text_type($type, array('text', 'tokens'));
    }

    /**
     * Called for processing a single text element in a field.
     *
     * The default implementation just calls process().
     *
     * $value can either be left a string, or changed into an array of tokens.
     * A token is an associative array containing:
     * - value: Either the text inside the token, or a nested array of tokens.
     *   The score of nested tokens will be multiplied by their parent's score.
     * - score: The relative importance of the token, as a float, with 1 being
     *   the default.
     *
     * @param mixed $value
     *   The value to process, modified in place.
     */
    protected function processFieldValue(&$value) {
      $this->process($value);
    }

    /**
     * Called for processing a single search keyword.
     *
     * The default implementation just calls process().
     *
     * $value can either be left a string, or be changed into a nested keys
     * array, as defined by SearchApiQueryInterface.
     *
     * @param mixed $value
     *   The keyword to process, modified in place.
     */
    protected function processKey(&$value) {
      $this->process($value);
    }

    /**
     * Called for processing a single filter value.
     *
     * The default implementation just calls process().
     *
     * $value has to remain a string.
     *
     * @param mixed $value
     *   The filter value to process, modified in place.
     */
    protected function processFilterValue(&$value) {
      $this->process($value);
    }

    /**
     * Function that is ultimately called for all text by the standard
     * implementation, and does nothing by default.
     *
     * @param mixed $value
     *   The value to preprocess as a string. Can be manipulated directly,
     *   nothing has to be returned. Since this can be called for all value
     *   types, $value has to remain a string.
     */
    protected function process(&$value) {
    }

  }