search.extender.inc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. <?php
  2. /**
  3. * @file
  4. * Search query extender and helper functions.
  5. */
  6. /**
  7. * Do a query on the full-text search index for a word or words.
  8. *
  9. * This function is normally only called by each module that supports the
  10. * indexed search (and thus, implements hook_update_index()).
  11. *
  12. * Results are retrieved in two logical passes. However, the two passes are
  13. * joined together into a single query. And in the case of most simple
  14. * queries the second pass is not even used.
  15. *
  16. * The first pass selects a set of all possible matches, which has the benefit
  17. * of also providing the exact result set for simple "AND" or "OR" searches.
  18. *
  19. * The second portion of the query further refines this set by verifying
  20. * advanced text conditions (such as negative or phrase matches).
  21. *
  22. * The used query object has the tag 'search_$module' and can be further
  23. * extended with hook_query_alter().
  24. */
  25. class SearchQuery extends SelectQueryExtender {
  26. /**
  27. * The search query that is used for searching.
  28. *
  29. * @var string
  30. */
  31. protected $searchExpression;
  32. /**
  33. * Type of search (search module).
  34. *
  35. * This maps to the value of the type column in search_index, and is equal
  36. * to the machine-readable name of the module that implements
  37. * hook_search_info().
  38. *
  39. * @var string
  40. */
  41. protected $type;
  42. /**
  43. * Positive and negative search keys.
  44. *
  45. * @var array
  46. */
  47. protected $keys = array('positive' => array(), 'negative' => array());
  48. /**
  49. * Indicates whether the first pass query requires complex conditions (LIKE).
  50. *
  51. * @var boolean.
  52. */
  53. protected $simple = TRUE;
  54. /**
  55. * Conditions that are used for exact searches.
  56. *
  57. * This is always used for the second pass query but not for the first pass,
  58. * unless $this->simple is FALSE.
  59. *
  60. * @var DatabaseCondition
  61. */
  62. protected $conditions;
  63. /**
  64. * Indicates how many matches for a search query are necessary.
  65. *
  66. * @var int
  67. */
  68. protected $matches = 0;
  69. /**
  70. * Array of search words.
  71. *
  72. * These words have to match against {search_index}.word.
  73. *
  74. * @var array
  75. */
  76. protected $words = array();
  77. /**
  78. * Multiplier for the normalized search score.
  79. *
  80. * This value is calculated by the first pass query and multiplied with the
  81. * actual score of a specific word to make sure that the resulting calculated
  82. * score is between 0 and 1.
  83. *
  84. * @var float
  85. */
  86. protected $normalize;
  87. /**
  88. * Indicates whether the first pass query has been executed.
  89. *
  90. * @var boolean
  91. */
  92. protected $executedFirstPass = FALSE;
  93. /**
  94. * Stores score expressions.
  95. *
  96. * @var array
  97. */
  98. protected $scores = array();
  99. /**
  100. * Stores arguments for score expressions.
  101. *
  102. * @var array
  103. */
  104. protected $scoresArguments = array();
  105. /**
  106. * Total value of all the multipliers.
  107. *
  108. * @var array
  109. */
  110. protected $multiply = array();
  111. /**
  112. * Whether or not search expressions were ignored.
  113. *
  114. * The maximum number of AND/OR combinations exceeded can be configured to
  115. * avoid Denial-of-Service attacks. Expressions beyond the limit are ignored.
  116. *
  117. * @var boolean
  118. */
  119. protected $expressionsIgnored = FALSE;
  120. /**
  121. * Sets up the search query expression.
  122. *
  123. * @param $query
  124. * A search query string, which can contain options.
  125. * @param $module
  126. * The search module. This maps to {search_index}.type in the database.
  127. *
  128. * @return
  129. * The SearchQuery object.
  130. */
  131. public function searchExpression($expression, $module) {
  132. $this->searchExpression = $expression;
  133. $this->type = $module;
  134. return $this;
  135. }
  136. /**
  137. * Applies a search option and removes it from the search query string.
  138. *
  139. * These options are in the form option:value,value2,value3.
  140. *
  141. * @param $option
  142. * Name of the option.
  143. * @param $column
  144. * Name of the database column to which the value should be applied.
  145. *
  146. * @return
  147. * TRUE if a value for that option was found, FALSE if not.
  148. */
  149. public function setOption($option, $column) {
  150. if ($values = search_expression_extract($this->searchExpression, $option)) {
  151. $or = db_or();
  152. foreach (explode(',', $values) as $value) {
  153. $or->condition($column, $value);
  154. }
  155. $this->condition($or);
  156. $this->searchExpression = search_expression_insert($this->searchExpression, $option);
  157. return TRUE;
  158. }
  159. return FALSE;
  160. }
  161. /**
  162. * Parses the search query into SQL conditions.
  163. *
  164. * We build two queries that match the dataset bodies.
  165. */
  166. protected function parseSearchExpression() {
  167. // Matchs words optionally prefixed by a dash. A word in this case is
  168. // something between two spaces, optionally quoted.
  169. preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' ' . $this->searchExpression , $keywords, PREG_SET_ORDER);
  170. if (count($keywords) == 0) {
  171. return;
  172. }
  173. // Classify tokens.
  174. $or = FALSE;
  175. $warning = '';
  176. $limit_combinations = variable_get('search_and_or_limit', 7);
  177. // The first search expression does not count as AND.
  178. $and_count = -1;
  179. $or_count = 0;
  180. foreach ($keywords as $match) {
  181. if ($or_count && $and_count + $or_count >= $limit_combinations) {
  182. // Ignore all further search expressions to prevent Denial-of-Service
  183. // attacks using a high number of AND/OR combinations.
  184. $this->expressionsIgnored = TRUE;
  185. break;
  186. }
  187. $phrase = FALSE;
  188. // Strip off phrase quotes.
  189. if ($match[2]{0} == '"') {
  190. $match[2] = substr($match[2], 1, -1);
  191. $phrase = TRUE;
  192. $this->simple = FALSE;
  193. }
  194. // Simplify keyword according to indexing rules and external
  195. // preprocessors. Use same process as during search indexing, so it
  196. // will match search index.
  197. $words = search_simplify($match[2]);
  198. // Re-explode in case simplification added more words, except when
  199. // matching a phrase.
  200. $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);
  201. // Negative matches.
  202. if ($match[1] == '-') {
  203. $this->keys['negative'] = array_merge($this->keys['negative'], $words);
  204. }
  205. // OR operator: instead of a single keyword, we store an array of all
  206. // OR'd keywords.
  207. elseif ($match[2] == 'OR' && count($this->keys['positive'])) {
  208. $last = array_pop($this->keys['positive']);
  209. // Starting a new OR?
  210. if (!is_array($last)) {
  211. $last = array($last);
  212. }
  213. $this->keys['positive'][] = $last;
  214. $or = TRUE;
  215. $or_count++;
  216. continue;
  217. }
  218. // AND operator: implied, so just ignore it.
  219. elseif ($match[2] == 'AND' || $match[2] == 'and') {
  220. $warning = $match[2];
  221. continue;
  222. }
  223. // Plain keyword.
  224. else {
  225. if ($match[2] == 'or') {
  226. $warning = $match[2];
  227. }
  228. if ($or) {
  229. // Add to last element (which is an array).
  230. $this->keys['positive'][count($this->keys['positive']) - 1] = array_merge($this->keys['positive'][count($this->keys['positive']) - 1], $words);
  231. }
  232. else {
  233. $this->keys['positive'] = array_merge($this->keys['positive'], $words);
  234. $and_count++;
  235. }
  236. }
  237. $or = FALSE;
  238. }
  239. // Convert keywords into SQL statements.
  240. $this->conditions = db_and();
  241. $simple_and = FALSE;
  242. $simple_or = FALSE;
  243. // Positive matches.
  244. foreach ($this->keys['positive'] as $key) {
  245. // Group of ORed terms.
  246. if (is_array($key) && count($key)) {
  247. $simple_or = TRUE;
  248. $any = FALSE;
  249. $queryor = db_or();
  250. foreach ($key as $or) {
  251. list($num_new_scores) = $this->parseWord($or);
  252. $any |= $num_new_scores;
  253. $queryor->condition('d.data', "% $or %", 'LIKE');
  254. }
  255. if (count($queryor)) {
  256. $this->conditions->condition($queryor);
  257. // A group of OR keywords only needs to match once.
  258. $this->matches += ($any > 0);
  259. }
  260. }
  261. // Single ANDed term.
  262. else {
  263. $simple_and = TRUE;
  264. list($num_new_scores, $num_valid_words) = $this->parseWord($key);
  265. $this->conditions->condition('d.data', "% $key %", 'LIKE');
  266. if (!$num_valid_words) {
  267. $this->simple = FALSE;
  268. }
  269. // Each AND keyword needs to match at least once.
  270. $this->matches += $num_new_scores;
  271. }
  272. }
  273. if ($simple_and && $simple_or) {
  274. $this->simple = FALSE;
  275. }
  276. // Negative matches.
  277. foreach ($this->keys['negative'] as $key) {
  278. $this->conditions->condition('d.data', "% $key %", 'NOT LIKE');
  279. $this->simple = FALSE;
  280. }
  281. if ($warning == 'or') {
  282. drupal_set_message(t('Search for either of the two terms with uppercase <strong>OR</strong>. For example, <strong>cats OR dogs</strong>.'));
  283. }
  284. }
  285. /**
  286. * Helper function for parseQuery().
  287. */
  288. protected function parseWord($word) {
  289. $num_new_scores = 0;
  290. $num_valid_words = 0;
  291. // Determine the scorewords of this word/phrase.
  292. $split = explode(' ', $word);
  293. foreach ($split as $s) {
  294. $num = is_numeric($s);
  295. if ($num || drupal_strlen($s) >= variable_get('minimum_word_size', 3)) {
  296. if (!isset($this->words[$s])) {
  297. $this->words[$s] = $s;
  298. $num_new_scores++;
  299. }
  300. $num_valid_words++;
  301. }
  302. }
  303. // Return matching snippet and number of added words.
  304. return array($num_new_scores, $num_valid_words);
  305. }
  306. /**
  307. * Executes the first pass query.
  308. *
  309. * This can either be done explicitly, so that additional scores and
  310. * conditions can be applied to the second pass query, or implicitly by
  311. * addScore() or execute().
  312. *
  313. * @return
  314. * TRUE if search items exist, FALSE if not.
  315. */
  316. public function executeFirstPass() {
  317. $this->parseSearchExpression();
  318. if (count($this->words) == 0) {
  319. form_set_error('keys', format_plural(variable_get('minimum_word_size', 3), 'You must include at least one positive keyword with 1 character or more.', 'You must include at least one positive keyword with @count characters or more.'));
  320. return FALSE;
  321. }
  322. if ($this->expressionsIgnored) {
  323. drupal_set_message(t('Your search used too many AND/OR expressions. Only the first @count terms were included in this search.', array('@count' => variable_get('search_and_or_limit', 7))), 'warning');
  324. }
  325. $this->executedFirstPass = TRUE;
  326. if (!empty($this->words)) {
  327. $or = db_or();
  328. foreach ($this->words as $word) {
  329. $or->condition('i.word', $word);
  330. }
  331. $this->condition($or);
  332. }
  333. // Build query for keyword normalization.
  334. $this->join('search_total', 't', 'i.word = t.word');
  335. $this
  336. ->condition('i.type', $this->type)
  337. ->groupBy('i.type')
  338. ->groupBy('i.sid')
  339. ->having('COUNT(*) >= :matches', array(':matches' => $this->matches));
  340. // Clone the query object to do the firstPass query;
  341. $first = clone $this->query;
  342. // For complex search queries, add the LIKE conditions to the first pass query.
  343. if (!$this->simple) {
  344. $first->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type');
  345. $first->condition($this->conditions);
  346. }
  347. // Calculate maximum keyword relevance, to normalize it.
  348. $first->addExpression('SUM(i.score * t.count)', 'calculated_score');
  349. $this->normalize = $first
  350. ->range(0, 1)
  351. ->orderBy('calculated_score', 'DESC')
  352. ->execute()
  353. ->fetchField();
  354. if ($this->normalize) {
  355. return TRUE;
  356. }
  357. return FALSE;
  358. }
  359. /**
  360. * Adds a custom score expression to the search query.
  361. *
  362. * Each score expression can optionally use a multiplier, and multiple
  363. * expressions are combined.
  364. *
  365. * @param $score
  366. * The score expression.
  367. * @param $arguments
  368. * Custom query arguments for that expression.
  369. * @param $multiply
  370. * If set, the score is multiplied with that value. Search query ensures
  371. * that the search scores are still normalized.
  372. */
  373. public function addScore($score, $arguments = array(), $multiply = FALSE) {
  374. if ($multiply) {
  375. $i = count($this->multiply);
  376. $score = "CAST(:multiply_$i AS DECIMAL) * COALESCE(( " . $score . "), 0) / CAST(:total_$i AS DECIMAL)";
  377. $arguments[':multiply_' . $i] = $multiply;
  378. $this->multiply[] = $multiply;
  379. }
  380. $this->scores[] = $score;
  381. $this->scoresArguments += $arguments;
  382. return $this;
  383. }
  384. /**
  385. * Executes the search.
  386. *
  387. * If not already done, this executes the first pass query. Then the complex
  388. * conditions are applied to the query including score expressions and
  389. * ordering.
  390. *
  391. * @return
  392. * FALSE if the first pass query returned no results, and a database result
  393. * set if there were results.
  394. */
  395. public function execute()
  396. {
  397. if (!$this->executedFirstPass) {
  398. $this->executeFirstPass();
  399. }
  400. if (!$this->normalize) {
  401. return new DatabaseStatementEmpty();
  402. }
  403. // Add conditions to query.
  404. $this->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type');
  405. $this->condition($this->conditions);
  406. if (empty($this->scores)) {
  407. // Add default score.
  408. $this->addScore('i.relevance');
  409. }
  410. if (count($this->multiply)) {
  411. // Add the total multiplicator as many times as requested to maintain
  412. // normalization as far as possible.
  413. $i = 0;
  414. $sum = array_sum($this->multiply);
  415. foreach ($this->multiply as $total) {
  416. $this->scoresArguments[':total_' . $i] = $sum;
  417. $i++;
  418. }
  419. }
  420. // Replace i.relevance pseudo-field with the actual, normalized value.
  421. $this->scores = str_replace('i.relevance', '(' . (1.0 / $this->normalize) . ' * i.score * t.count)', $this->scores);
  422. // Convert scores to an expression.
  423. $this->addExpression('SUM(' . implode(' + ', $this->scores) . ')', 'calculated_score', $this->scoresArguments);
  424. if (count($this->getOrderBy()) == 0) {
  425. // Add default order after adding the expression.
  426. $this->orderBy('calculated_score', 'DESC');
  427. }
  428. // Add tag and useful metadata.
  429. $this
  430. ->addTag('search_' . $this->type)
  431. ->addMetaData('normalize', $this->normalize)
  432. ->fields('i', array('type', 'sid'));
  433. return $this->query->execute();
  434. }
  435. /**
  436. * Builds the default count query for SearchQuery.
  437. *
  438. * Since SearchQuery always uses GROUP BY, we can default to a subquery. We
  439. * also add the same conditions as execute() because countQuery() is called
  440. * first.
  441. */
  442. public function countQuery() {
  443. // Clone the inner query.
  444. $inner = clone $this->query;
  445. // Add conditions to query.
  446. $inner->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type');
  447. $inner->condition($this->conditions);
  448. // Remove existing fields and expressions, they are not needed for a count
  449. // query.
  450. $fields =& $inner->getFields();
  451. $fields = array();
  452. $expressions =& $inner->getExpressions();
  453. $expressions = array();
  454. // Add the sid as the only field and count them as a subquery.
  455. $count = db_select($inner->fields('i', array('sid')), NULL, array('target' => 'slave'));
  456. // Add the COUNT() expression.
  457. $count->addExpression('COUNT(*)');
  458. return $count;
  459. }
  460. }