search.extender.inc 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. <?php
  2. /**
  3. * @file
  4. * Search query extender and helper functions.
  5. */
  6. /**
  7. * Do a query on the full-text search index for a word or words.
  8. *
  9. * This function is normally only called by each module that supports the
  10. * indexed search (and thus, implements hook_update_index()).
  11. *
  12. * Results are retrieved in two logical passes. However, the two passes are
  13. * joined together into a single query. And in the case of most simple
  14. * queries the second pass is not even used.
  15. *
  16. * The first pass selects a set of all possible matches, which has the benefit
  17. * of also providing the exact result set for simple "AND" or "OR" searches.
  18. *
  19. * The second portion of the query further refines this set by verifying
  20. * advanced text conditions (such as negative or phrase matches).
  21. *
  22. * The used query object has the tag 'search_$module' and can be further
  23. * extended with hook_query_alter().
  24. */
  25. class SearchQuery extends SelectQueryExtender {
  26. /**
  27. * The search query that is used for searching.
  28. *
  29. * @var string
  30. */
  31. protected $searchExpression;
  32. /**
  33. * Type of search (search module).
  34. *
  35. * This maps to the value of the type column in search_index, and is equal
  36. * to the machine-readable name of the module that implements
  37. * hook_search_info().
  38. *
  39. * @var string
  40. */
  41. protected $type;
  42. /**
  43. * Positive and negative search keys.
  44. *
  45. * @var array
  46. */
  47. protected $keys = array('positive' => array(), 'negative' => array());
  48. /**
  49. * Indicates whether the first pass query requires complex conditions (LIKE).
  50. *
  51. * @var boolean.
  52. */
  53. protected $simple = TRUE;
  54. /**
  55. * Conditions that are used for exact searches.
  56. *
  57. * This is always used for the second pass query but not for the first pass,
  58. * unless $this->simple is FALSE.
  59. *
  60. * @var DatabaseCondition
  61. */
  62. protected $conditions;
  63. /**
  64. * Indicates how many matches for a search query are necessary.
  65. *
  66. * @var int
  67. */
  68. protected $matches = 0;
  69. /**
  70. * Array of search words.
  71. *
  72. * These words have to match against {search_index}.word.
  73. *
  74. * @var array
  75. */
  76. protected $words = array();
  77. /**
  78. * Multiplier for the normalized search score.
  79. *
  80. * This value is calculated by the first pass query and multiplied with the
  81. * actual score of a specific word to make sure that the resulting calculated
  82. * score is between 0 and 1.
  83. *
  84. * @var float
  85. */
  86. protected $normalize;
  87. /**
  88. * Indicates whether the first pass query has been executed.
  89. *
  90. * @var boolean
  91. */
  92. protected $executedFirstPass = FALSE;
  93. /**
  94. * Stores score expressions.
  95. *
  96. * @var array
  97. *
  98. * @see addScore()
  99. */
  100. protected $scores = array();
  101. /**
  102. * Stores arguments for score expressions.
  103. *
  104. * @var array
  105. */
  106. protected $scoresArguments = array();
  107. /**
  108. * Stores multipliers for score expressions.
  109. *
  110. * @var array
  111. */
  112. protected $multiply = array();
  113. /**
  114. * Whether or not search expressions were ignored.
  115. *
  116. * The maximum number of AND/OR combinations exceeded can be configured to
  117. * avoid Denial-of-Service attacks. Expressions beyond the limit are ignored.
  118. *
  119. * @var boolean
  120. */
  121. protected $expressionsIgnored = FALSE;
  122. /**
  123. * Sets up the search query expression.
  124. *
  125. * @param $query
  126. * A search query string, which can contain options.
  127. * @param $module
  128. * The search module. This maps to {search_index}.type in the database.
  129. *
  130. * @return
  131. * The SearchQuery object.
  132. */
  133. public function searchExpression($expression, $module) {
  134. $this->searchExpression = $expression;
  135. $this->type = $module;
  136. // Add a search_* tag. This needs to be added before any preExecute methods
  137. // for decorated queries are called, as $this->prepared will be set to TRUE
  138. // and tags added in the execute method will never get used. For example,
  139. // if $query is extended by 'SearchQuery' then 'PagerDefault', the
  140. // search-specific tag will be added too late (when preExecute() has
  141. // already been called from the PagerDefault extender), and as a
  142. // consequence will not be available to hook_query_alter() implementations,
  143. // nor will the correct hook_query_TAG_alter() implementations get invoked.
  144. // See node_search_execute().
  145. $this->addTag('search_' . $module);
  146. return $this;
  147. }
  148. /**
  149. * Applies a search option and removes it from the search query string.
  150. *
  151. * These options are in the form option:value,value2,value3.
  152. *
  153. * @param $option
  154. * Name of the option.
  155. * @param $column
  156. * Name of the database column to which the value should be applied.
  157. *
  158. * @return
  159. * TRUE if a value for that option was found, FALSE if not.
  160. */
  161. public function setOption($option, $column) {
  162. if ($values = search_expression_extract($this->searchExpression, $option)) {
  163. $or = db_or();
  164. foreach (explode(',', $values) as $value) {
  165. $or->condition($column, $value);
  166. }
  167. $this->condition($or);
  168. $this->searchExpression = search_expression_insert($this->searchExpression, $option);
  169. return TRUE;
  170. }
  171. return FALSE;
  172. }
  173. /**
  174. * Parses the search query into SQL conditions.
  175. *
  176. * We build two queries that match the dataset bodies.
  177. */
  178. protected function parseSearchExpression() {
  179. // Matchs words optionally prefixed by a dash. A word in this case is
  180. // something between two spaces, optionally quoted.
  181. preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' ' . $this->searchExpression , $keywords, PREG_SET_ORDER);
  182. if (count($keywords) == 0) {
  183. return;
  184. }
  185. // Classify tokens.
  186. $or = FALSE;
  187. $warning = '';
  188. $limit_combinations = variable_get('search_and_or_limit', 7);
  189. // The first search expression does not count as AND.
  190. $and_count = -1;
  191. $or_count = 0;
  192. foreach ($keywords as $match) {
  193. if ($or_count && $and_count + $or_count >= $limit_combinations) {
  194. // Ignore all further search expressions to prevent Denial-of-Service
  195. // attacks using a high number of AND/OR combinations.
  196. $this->expressionsIgnored = TRUE;
  197. break;
  198. }
  199. $phrase = FALSE;
  200. // Strip off phrase quotes.
  201. if ($match[2][0] == '"') {
  202. $match[2] = substr($match[2], 1, -1);
  203. $phrase = TRUE;
  204. $this->simple = FALSE;
  205. }
  206. // Simplify keyword according to indexing rules and external
  207. // preprocessors. Use same process as during search indexing, so it
  208. // will match search index.
  209. $words = search_simplify($match[2]);
  210. // Re-explode in case simplification added more words, except when
  211. // matching a phrase.
  212. $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
  213. // Negative matches.
  214. if ($match[1] == '-') {
  215. $this->keys['negative'] = array_merge($this->keys['negative'], $words);
  216. }
  217. // OR operator: instead of a single keyword, we store an array of all
  218. // OR'd keywords.
  219. elseif ($match[2] == 'OR' && count($this->keys['positive'])) {
  220. $last = array_pop($this->keys['positive']);
  221. // Starting a new OR?
  222. if (!is_array($last)) {
  223. $last = array($last);
  224. }
  225. $this->keys['positive'][] = $last;
  226. $or = TRUE;
  227. $or_count++;
  228. continue;
  229. }
  230. // AND operator: implied, so just ignore it.
  231. elseif ($match[2] == 'AND' || $match[2] == 'and') {
  232. $warning = $match[2];
  233. continue;
  234. }
  235. // Plain keyword.
  236. else {
  237. if ($match[2] == 'or') {
  238. $warning = $match[2];
  239. }
  240. if ($or) {
  241. // Add to last element (which is an array).
  242. $this->keys['positive'][count($this->keys['positive']) - 1] = array_merge($this->keys['positive'][count($this->keys['positive']) - 1], $words);
  243. }
  244. else {
  245. $this->keys['positive'] = array_merge($this->keys['positive'], $words);
  246. $and_count++;
  247. }
  248. }
  249. $or = FALSE;
  250. }
  251. // Convert keywords into SQL statements.
  252. $this->conditions = db_and();
  253. $simple_and = FALSE;
  254. $simple_or = FALSE;
  255. // Positive matches.
  256. foreach ($this->keys['positive'] as $key) {
  257. // Group of ORed terms.
  258. if (is_array($key) && count($key)) {
  259. $simple_or = TRUE;
  260. $any = FALSE;
  261. $queryor = db_or();
  262. foreach ($key as $or) {
  263. list($num_new_scores) = $this->parseWord($or);
  264. $any |= $num_new_scores;
  265. $queryor->condition('d.data', "% $or %", 'LIKE');
  266. }
  267. if (count($queryor)) {
  268. $this->conditions->condition($queryor);
  269. // A group of OR keywords only needs to match once.
  270. $this->matches += ($any > 0);
  271. }
  272. }
  273. // Single ANDed term.
  274. else {
  275. $simple_and = TRUE;
  276. list($num_new_scores, $num_valid_words) = $this->parseWord($key);
  277. $this->conditions->condition('d.data', "% $key %", 'LIKE');
  278. if (!$num_valid_words) {
  279. $this->simple = FALSE;
  280. }
  281. // Each AND keyword needs to match at least once.
  282. $this->matches += $num_new_scores;
  283. }
  284. }
  285. if ($simple_and && $simple_or) {
  286. $this->simple = FALSE;
  287. }
  288. // Negative matches.
  289. foreach ($this->keys['negative'] as $key) {
  290. $this->conditions->condition('d.data', "% $key %", 'NOT LIKE');
  291. $this->simple = FALSE;
  292. }
  293. if ($warning == 'or') {
  294. drupal_set_message(t('Search for either of the two terms with uppercase <strong>OR</strong>. For example, <strong>cats OR dogs</strong>.'));
  295. }
  296. }
  297. /**
  298. * Helper function for parseQuery().
  299. */
  300. protected function parseWord($word) {
  301. $num_new_scores = 0;
  302. $num_valid_words = 0;
  303. // Determine the scorewords of this word/phrase.
  304. $split = explode(' ', $word);
  305. foreach ($split as $s) {
  306. $num = is_numeric($s);
  307. if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 3)) {
  308. if (!isset($this->words[$s])) {
  309. $this->words[$s] = $s;
  310. $num_new_scores++;
  311. }
  312. $num_valid_words++;
  313. }
  314. }
  315. // Return matching snippet and number of added words.
  316. return array($num_new_scores, $num_valid_words);
  317. }
  318. /**
  319. * Executes the first pass query.
  320. *
  321. * This can either be done explicitly, so that additional scores and
  322. * conditions can be applied to the second pass query, or implicitly by
  323. * addScore() or execute().
  324. *
  325. * @return
  326. * TRUE if search items exist, FALSE if not.
  327. */
  328. public function executeFirstPass() {
  329. $this->parseSearchExpression();
  330. if (count($this->words) == 0) {
  331. form_set_error('keys', format_plural(variable_get('minimum_word_size', 3), 'You must include at least one positive keyword with 1 character or more.', 'You must include at least one positive keyword with @count characters or more.'));
  332. return FALSE;
  333. }
  334. if ($this->expressionsIgnored) {
  335. drupal_set_message(t('Your search used too many AND/OR expressions. Only the first @count terms were included in this search.', array('@count' => variable_get('search_and_or_limit', 7))), 'warning');
  336. }
  337. $this->executedFirstPass = TRUE;
  338. if (!empty($this->words)) {
  339. $or = db_or();
  340. foreach ($this->words as $word) {
  341. $or->condition('i.word', $word);
  342. }
  343. $this->condition($or);
  344. }
  345. // Build query for keyword normalization.
  346. $this->join('search_total', 't', 'i.word = t.word');
  347. $this
  348. ->condition('i.type', $this->type)
  349. ->groupBy('i.type')
  350. ->groupBy('i.sid')
  351. ->having('COUNT(*) >= :matches', array(':matches' => $this->matches));
  352. // Clone the query object to do the firstPass query;
  353. $first = clone $this->query;
  354. // For complex search queries, add the LIKE conditions to the first pass query.
  355. if (!$this->simple) {
  356. $first->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type');
  357. $first->condition($this->conditions);
  358. }
  359. // Calculate maximum keyword relevance, to normalize it.
  360. $first->addExpression('SUM(i.score * t.count)', 'calculated_score');
  361. $this->normalize = $first
  362. ->range(0, 1)
  363. ->orderBy('calculated_score', 'DESC')
  364. ->execute()
  365. ->fetchField();
  366. if ($this->normalize) {
  367. return TRUE;
  368. }
  369. return FALSE;
  370. }
  371. /**
  372. * Adds a custom score expression to the search query.
  373. *
  374. * Score expressions are used to order search results. If no calls to
  375. * addScore() have taken place, a default keyword relevance score will be
  376. * used. However, if at least one call to addScore() has taken place, the
  377. * keyword relevance score is not automatically added.
  378. *
  379. * Note that you must use this method to add ordering to your searches, and
  380. * not call orderBy() directly, when using the SearchQuery extender. This is
  381. * because of the two-pass system the SearchQuery class uses to normalize
  382. * scores.
  383. *
  384. * @param $score
  385. * The score expression, which should evaluate to a number between 0 and 1.
  386. * The string 'i.relevance' in a score expression will be replaced by a
  387. * measure of keyword relevance between 0 and 1.
  388. * @param $arguments
  389. * Query arguments needed to provide values to the score expression.
  390. * @param $multiply
  391. * If set, the score is multiplied with this value. However, all scores
  392. * with multipliers are then divided by the total of all multipliers, so
  393. * that overall, the normalization is maintained.
  394. *
  395. * @return object
  396. * The updated query object.
  397. */
  398. public function addScore($score, $arguments = array(), $multiply = FALSE) {
  399. if ($multiply) {
  400. $i = count($this->multiply);
  401. // Modify the score expression so it is multiplied by the multiplier,
  402. // with a divisor to renormalize.
  403. $score = "CAST(:multiply_$i AS DECIMAL) * COALESCE(( " . $score . "), 0) / CAST(:total_$i AS DECIMAL)";
  404. // Add an argument for the multiplier. The :total_$i argument is taken
  405. // care of in the execute() method, which is when the total divisor is
  406. // calculated.
  407. $arguments[':multiply_' . $i] = $multiply;
  408. $this->multiply[] = $multiply;
  409. }
  410. $this->scores[] = $score;
  411. $this->scoresArguments += $arguments;
  412. return $this;
  413. }
  414. /**
  415. * Executes the search.
  416. *
  417. * If not already done, this executes the first pass query. Then the complex
  418. * conditions are applied to the query including score expressions and
  419. * ordering.
  420. *
  421. * @return
  422. * FALSE if the first pass query returned no results, and a database result
  423. * set if there were results.
  424. */
  425. public function execute()
  426. {
  427. if (!$this->executedFirstPass) {
  428. $this->executeFirstPass();
  429. }
  430. if (!$this->normalize) {
  431. return new DatabaseStatementEmpty();
  432. }
  433. // Add conditions to query.
  434. $this->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type');
  435. $this->condition($this->conditions);
  436. if (empty($this->scores)) {
  437. // Add default score.
  438. $this->addScore('i.relevance');
  439. }
  440. if (count($this->multiply)) {
  441. // Re-normalize scores with multipliers by dividing by the total of all
  442. // multipliers. The expressions were altered in addScore(), so here just
  443. // add the arguments for the total.
  444. $i = 0;
  445. $sum = array_sum($this->multiply);
  446. foreach ($this->multiply as $total) {
  447. $this->scoresArguments[':total_' . $i] = $sum;
  448. $i++;
  449. }
  450. }
  451. // Replace the pseudo-expression 'i.relevance' with a measure of keyword
  452. // relevance in all score expressions, using string replacement. Careful
  453. // though! If you just print out a float, some locales use ',' as the
  454. // decimal separator in PHP, while SQL always uses '.'. So, make sure to
  455. // set the number format correctly.
  456. $relevance = number_format((1.0 / $this->normalize), 10, '.', '');
  457. $this->scores = str_replace('i.relevance', '(' . $relevance . ' * i.score * t.count)', $this->scores);
  458. // Add all scores together to form a query field.
  459. $this->addExpression('SUM(' . implode(' + ', $this->scores) . ')', 'calculated_score', $this->scoresArguments);
  460. // If an order has not yet been set for this query, add a default order
  461. // that sorts by the calculated sum of scores.
  462. if (count($this->getOrderBy()) == 0) {
  463. $this->orderBy('calculated_score', 'DESC');
  464. }
  465. // Add useful metadata.
  466. $this
  467. ->addMetaData('normalize', $this->normalize)
  468. ->fields('i', array('type', 'sid'));
  469. return $this->query->execute();
  470. }
  471. /**
  472. * Builds the default count query for SearchQuery.
  473. *
  474. * Since SearchQuery always uses GROUP BY, we can default to a subquery. We
  475. * also add the same conditions as execute() because countQuery() is called
  476. * first.
  477. */
  478. public function countQuery() {
  479. // Clone the inner query.
  480. $inner = clone $this->query;
  481. // Add conditions to query.
  482. $inner->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type');
  483. $inner->condition($this->conditions);
  484. // Remove existing fields and expressions, they are not needed for a count
  485. // query.
  486. $fields =& $inner->getFields();
  487. $fields = array();
  488. $expressions =& $inner->getExpressions();
  489. $expressions = array();
  490. // Add the sid as the only field and count them as a subquery.
  491. $count = db_select($inner->fields('i', array('sid')), NULL, array('target' => 'slave'));
  492. // Add the COUNT() expression.
  493. $count->addExpression('COUNT(*)');
  494. return $count;
  495. }
  496. }