SearchQuery.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649
  1. <?php
  2. namespace Drupal\search;
  3. use Drupal\Core\Database\Query\Condition;
  4. use Drupal\Component\Utility\Unicode;
  5. use Drupal\Core\Database\Query\SelectExtender;
  6. use Drupal\Core\Database\Query\SelectInterface;
  7. /**
  8. * Search query extender and helper functions.
  9. *
  10. * Performs a query on the full-text search index for a word or words.
  11. *
  12. * This query is used by search plugins that use the search index (not all
  13. * search plugins do, as some use a different searching mechanism). It
  14. * assumes you have set up a query on the {search_index} table with alias 'i',
  15. * and will only work if the user is searching for at least one "positive"
  16. * keyword or phrase.
  17. *
  18. * For efficiency, users of this query can run the prepareAndNormalize()
  19. * method to figure out if there are any search results, before fully setting
  20. * up and calling execute() to execute the query. The scoring expressions are
  21. * not needed until the execute() step. However, it's not really necessary
  22. * to do this, because this class's execute() method does that anyway.
  23. *
  24. * During both the prepareAndNormalize() and execute() steps, there can be
  25. * problems. Call getStatus() to figure out if the query is OK or not.
  26. *
  27. * The query object is given the tag 'search_$type' and can be further
  28. * extended with hook_query_alter().
  29. */
  30. class SearchQuery extends SelectExtender {
  /**
   * Status flag: the search expression held no positive keywords.
   *
   * Positive keywords are words that are searched for, as opposed to negative
   * keywords, which are words that are excluded. To count as a keyword, a
   * word must be at least
   * \Drupal::config('search.settings')->get('index.minimum_word_size')
   * characters.
   *
   * @see SearchQuery::getStatus()
   */
  const NO_POSITIVE_KEYWORDS = 1;

  /**
   * Status flag: part of the search expression was ignored.
   *
   * To prevent Denial of Service attacks, only
   * \Drupal::config('search.settings')->get('and_or_limit') expressions
   * (positive keywords, phrases, negative keywords) are allowed; this flag
   * indicates that expressions existed past that limit and they were removed.
   *
   * @see SearchQuery::getStatus()
   */
  const EXPRESSIONS_IGNORED = 2;

  /**
   * Status flag: a lower-case "or" was in the search expression.
   *
   * This probably means someone was trying to do an OR search but used
   * lower-case instead of upper-case.
   *
   * @see SearchQuery::getStatus()
   */
  const LOWER_CASE_OR = 4;

  /**
   * Status flag: no positive keyword matches were found.
   *
   * @see SearchQuery::getStatus()
   */
  const NO_KEYWORD_MATCHES = 8;

  /**
   * The keywords and advanced search options that are entered by the user.
   *
   * @var string
   */
  protected $searchExpression;

  /**
   * The type of search (search type).
   *
   * This maps to the value of the type column in search_index, and is usually
   * equal to the machine-readable name of the plugin or the search page.
   *
   * @var string
   */
  protected $type;

  /**
   * Parsed-out positive and negative search keys.
   *
   * Each entry under 'positive' is either a single term or an array of terms
   * that are ORed together.
   *
   * @var array
   */
  protected $keys = ['positive' => [], 'negative' => []];

  /**
   * Indicates whether the query conditions are simple or complex (LIKE).
   *
   * @var bool
   */
  protected $simple = TRUE;

  /**
   * Conditions that are used for exact searches.
   *
   * This is always used for the second step in the query, but is not part of
   * the preparation step unless $this->simple is FALSE. It is (re)created by
   * searchExpression().
   *
   * @var \Drupal\Core\Database\Query\Condition
   */
  protected $conditions;

  /**
   * Indicates how many matches for a search query are necessary.
   *
   * @var int
   */
  protected $matches = 0;

  /**
   * Array of positive search words, keyed by the word itself.
   *
   * These words have to match against {search_index}.word.
   *
   * @var array
   */
  protected $words = [];

  /**
   * Multiplier to normalize the keyword score.
   *
   * This value is calculated by the preparation step, and is used as a
   * multiplier of the word scores to make sure they are between 0 and 1. It
   * stays at its initial value of 0 until a non-zero score has been found.
   *
   * @var float|int
   */
  protected $normalize = 0;

  /**
   * Indicates whether the preparation step has been executed.
   *
   * @var bool
   */
  protected $executedPrepare = FALSE;

  /**
   * A bitmap of status conditions, described in getStatus().
   *
   * @var int
   *
   * @see SearchQuery::getStatus()
   */
  protected $status = 0;

  /**
   * The word score expressions.
   *
   * @var array
   *
   * @see SearchQuery::addScore()
   */
  protected $scores = [];

  /**
   * Arguments for the score expressions.
   *
   * @var array
   */
  protected $scoresArguments = [];

  /**
   * The number of 'i.relevance' occurrences in score expressions.
   *
   * @var int
   */
  protected $relevance_count = 0;

  /**
   * Multipliers for score expressions, in the order they were added.
   *
   * @var array
   *
   * @see SearchQuery::addScore()
   */
  protected $multiply = [];
  /**
   * Stores the search expression and resets the per-search state.
   *
   * Besides remembering the expression and type, this tags the query with
   * 'search_' followed by the type, replaces the collected conditions with a
   * fresh AND group and clears the status bitmap.
   *
   * @param string $expression
   *   A search string, which can contain keywords and options.
   * @param string $type
   *   The search type. This maps to {search_index}.type in the database.
   *
   * @return $this
   */
  public function searchExpression($expression, $type) {
    // Start from a clean slate: no status flags, empty AND condition group.
    $this->status = 0;
    $this->conditions = new Condition('AND');

    $this->type = $type;
    $this->searchExpression = $expression;

    // Tag the query so it can be identified per search type.
    $this->addTag('search_' . $type);

    return $this;
  }
  /**
   * Parses the search expression into keys, words and SQL conditions.
   *
   * Sets up the following variables:
   * - $this->keys
   * - $this->words
   * - $this->conditions
   * - $this->simple
   * - $this->matches
   *
   * May also raise the EXPRESSIONS_IGNORED and LOWER_CASE_OR status flags.
   * Returns early, leaving everything untouched, when the expression contains
   * no tokens at all.
   */
  protected function parseSearchExpression() {
    // Matches words optionally prefixed by a - sign. A word in this case is
    // something between two spaces, optionally quoted.
    preg_match_all('/ (-?)("[^"]+"|[^" ]+)/i', ' ' . $this->searchExpression, $keywords, PREG_SET_ORDER);

    if (count($keywords) == 0) {
      return;
    }

    // Classify tokens.
    $in_or = FALSE;
    $limit_combinations = \Drupal::config('search.settings')->get('and_or_limit');
    // The first search expression does not count as AND.
    $and_count = -1;
    $or_count = 0;
    foreach ($keywords as $match) {
      if ($or_count && $and_count + $or_count >= $limit_combinations) {
        // Ignore all further search expressions to prevent Denial-of-Service
        // attacks using a high number of AND/OR combinations.
        $this->status |= SearchQuery::EXPRESSIONS_IGNORED;
        break;
      }

      // Strip off phrase quotes. The pattern above guarantees $match[2] is
      // never empty, so offset 0 always exists. Square brackets are used
      // because the curly-brace offset syntax is a fatal error on PHP 8.
      $phrase = FALSE;
      if ($match[2][0] === '"') {
        $match[2] = substr($match[2], 1, -1);
        $phrase = TRUE;
        $this->simple = FALSE;
      }

      // Simplify keyword according to indexing rules and external
      // preprocessors. Use same process as during search indexing, so it
      // will match search index.
      $words = search_simplify($match[2]);
      // Re-explode in case simplification added more words, except when
      // matching a phrase.
      $words = $phrase ? [$words] : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);

      // Negative matches.
      if ($match[1] === '-') {
        $this->keys['negative'] = array_merge($this->keys['negative'], $words);
      }
      // OR operator: instead of a single keyword, we store an array of all
      // OR'd keywords.
      elseif ($match[2] === 'OR' && count($this->keys['positive'])) {
        $last = array_pop($this->keys['positive']);
        // Starting a new OR?
        if (!is_array($last)) {
          $last = [$last];
        }
        $this->keys['positive'][] = $last;
        $in_or = TRUE;
        $or_count++;
        continue;
      }
      // AND operator: implied, so just ignore it.
      elseif ($match[2] === 'AND' || $match[2] === 'and') {
        continue;
      }
      // Plain keyword.
      else {
        if ($match[2] === 'or') {
          // Lower-case "or" instead of "OR" is a warning condition.
          $this->status |= SearchQuery::LOWER_CASE_OR;
        }

        if ($in_or) {
          // Add to last element (which is an array).
          $last_index = count($this->keys['positive']) - 1;
          $this->keys['positive'][$last_index] = array_merge($this->keys['positive'][$last_index], $words);
        }
        else {
          $this->keys['positive'] = array_merge($this->keys['positive'], $words);
          $and_count++;
        }
      }
      $in_or = FALSE;
    }

    // Convert keywords into SQL statements.
    $has_and = FALSE;
    $has_or = FALSE;
    // Positive matches.
    foreach ($this->keys['positive'] as $key) {
      // Group of ORed terms.
      if (is_array($key) && count($key)) {
        // If we had already found one OR, this is another one AND-ed with the
        // first, meaning it is not a simple query.
        if ($has_or) {
          $this->simple = FALSE;
        }
        $has_or = TRUE;
        $has_new_scores = FALSE;
        $queryor = new Condition('OR');
        foreach ($key as $or) {
          list($num_new_scores) = $this->parseWord($or);
          $has_new_scores |= $num_new_scores;
          $queryor->condition('d.data', "% $or %", 'LIKE');
        }
        if (count($queryor)) {
          $this->conditions->condition($queryor);
          // A group of OR keywords only needs to match once, so it adds at
          // most one required match regardless of how many words it has.
          $this->matches += (int) ($has_new_scores > 0);
        }
      }
      // Single ANDed term.
      else {
        $has_and = TRUE;
        list($num_new_scores, $num_valid_words) = $this->parseWord($key);
        $this->conditions->condition('d.data', "% $key %", 'LIKE');
        if (!$num_valid_words) {
          $this->simple = FALSE;
        }
        // Each AND keyword needs to match at least once.
        $this->matches += $num_new_scores;
      }
    }
    if ($has_and && $has_or) {
      $this->simple = FALSE;
    }

    // Negative matches.
    foreach ($this->keys['negative'] as $key) {
      $this->conditions->condition('d.data', "% $key %", 'NOT LIKE');
      $this->simple = FALSE;
    }
  }
  /**
   * Registers the scorable words of a word or phrase.
   *
   * Helper for parseSearchExpression(). The input is split on single spaces.
   * A piece counts as valid when it is numeric or at least
   * \Drupal::config('search.settings')->get('index.minimum_word_size')
   * characters long. Valid pieces not yet present in $this->words are added
   * to it.
   *
   * @param string $word
   *   A single word, or a phrase of space-separated words.
   *
   * @return array
   *   A list of two integers: the number of words newly added to
   *   $this->words, and the number of valid words found in the input.
   */
  protected function parseWord($word) {
    $num_new_scores = 0;
    $num_valid_words = 0;

    // The minimum size is the same for every piece, so look it up only once
    // instead of on each loop iteration.
    $minimum_word_size = \Drupal::config('search.settings')->get('index.minimum_word_size');

    // Determine the scorewords of this word/phrase.
    foreach (explode(' ', $word) as $s) {
      if (is_numeric($s) || Unicode::strlen($s) >= $minimum_word_size) {
        if (!isset($this->words[$s])) {
          $this->words[$s] = $s;
          $num_new_scores++;
        }
        $num_valid_words++;
      }
    }

    // Return the number of newly added words and the number of valid words.
    return [$num_new_scores, $num_valid_words];
  }
  /**
   * Runs the first pass: parses the expression and finds the top raw score.
   *
   * The search expression is parsed, the keyword conditions are added to this
   * query, and a clone of the query is executed to obtain the highest
   * SUM(i.score * t.count) value. That value is stored as the normalization
   * factor used later for relevance scoring.
   *
   * Error and warning conditions can apply. Call getStatus() after calling
   * this method to retrieve them.
   *
   * @return bool
   *   TRUE if at least one keyword matched the search index; FALSE if not.
   */
  public function prepareAndNormalize() {
    $this->parseSearchExpression();
    $this->executedPrepare = TRUE;

    // Without positive keywords there is no point in joining other tables or
    // normalizing, even though the query itself could technically proceed.
    if (!count($this->words)) {
      $this->status |= SearchQuery::NO_POSITIVE_KEYWORDS;
      return FALSE;
    }

    // Basic search query: the index row must hold one of the entered words.
    $any_word = new Condition('OR');
    foreach ($this->words as $keyword) {
      $any_word->condition('i.word', $keyword);
    }
    $this->condition($any_word);

    // Bring in the per-word totals needed for normalization, restrict to the
    // current search type, and group per indexed item.
    $this->join('search_total', 't', 'i.word = t.word');
    $this->condition('i.type', $this->type);
    $this->groupBy('i.type');
    $this->groupBy('i.sid');

    // Only a simple query has a reliable count of required word matches. For
    // non-simple queries this criterion could wrongly stop the full query
    // from running, so it is left out.
    if ($this->simple) {
      $this->having('COUNT(*) >= :matches', [':matches' => $this->matches]);
    }

    // Work on a copy so the normalization pass does not alter the real query.
    $scoring_query = clone $this->query;

    // Complex searches need the LIKE conditions for normalization as well;
    // simple ones can do without them.
    if (!$this->simple) {
      $scoring_query->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
      if (count($this->conditions)) {
        $scoring_query->condition($this->conditions);
      }
    }

    // The normalization value is the maximum over all items of the summed
    // scores of the positive keywords. Note that the user of this extender
    // may have added other fields to the query.
    $scoring_query->addExpression('SUM(i.score * t.count)', 'calculated_score');
    $top_row = $scoring_query
      ->range(0, 1)
      ->orderBy('calculated_score', 'DESC')
      ->execute()
      ->fetchObject();
    if (isset($top_row->calculated_score)) {
      $this->normalize = (float) $top_row->calculated_score;
    }

    // A zero normalization value means that none of the supplied positive
    // keywords matched anything.
    if (!$this->normalize) {
      $this->status |= SearchQuery::NO_KEYWORD_MATCHES;
      return FALSE;
    }

    return TRUE;
  }
  /**
   * Makes sure the preparation step has run before the query executes.
   *
   * Runs prepareAndNormalize() if it has not been run yet.
   *
   * @param \Drupal\Core\Database\Query\SelectInterface $query
   *   (optional) The query passed on to the parent implementation.
   *
   * @return mixed
   *   FALSE when no normalization value is available (meaning there is
   *   nothing to search for); otherwise the result of the parent
   *   implementation.
   */
  public function preExecute(SelectInterface $query = NULL) {
    // Run the first pass lazily if the caller has not done so already.
    if (!$this->executedPrepare) {
      $this->prepareAndNormalize();
    }

    return $this->normalize ? parent::preExecute($query) : FALSE;
  }
  /**
   * Registers a score expression used to rank the search results.
   *
   * If addScore() is never called, execute() falls back to a default keyword
   * relevance score. As soon as it has been called at least once, keyword
   * relevance is only included if a score expression asks for it.
   *
   * Ordering must be added through this method rather than by calling
   * orderBy() directly, because of the two-pass system the SearchQuery class
   * uses to normalize scores.
   *
   * @param string $score
   *   The score expression, which should evaluate to a number between 0 and 1.
   *   The string 'i.relevance' in a score expression will be replaced by a
   *   measure of keyword relevance between 0 and 1.
   * @param array $arguments
   *   Query arguments needed to provide values to the score expression.
   * @param float $multiply
   *   If set, the score is multiplied with this value. However, all scores
   *   with multipliers are then divided by the total of all multipliers, so
   *   that overall, the normalization is maintained.
   *
   * @return $this
   */
  public function addScore($score, $arguments = [], $multiply = FALSE) {
    if ($multiply) {
      $index = count($this->multiply);
      $this->multiply[] = $multiply;
      // Only the multiplier argument is known at this point. The matching
      // :total_$index argument is supplied by execute(), once the sum of all
      // multipliers is known.
      $arguments[':multiply_' . $index] = $multiply;
      // Wrap the expression so it is scaled by the multiplier and divided by
      // the total to renormalize. The ROUND() calls are needed on PostgreSQL
      // and SQLite so the placeholders are treated as numeric, because the
      // PostgreSQL PDO driver sometimes passes values as strings in complex
      // expressions like this.
      $score = "(ROUND(:multiply_$index, 4)) * COALESCE(($score), 0) / (ROUND(:total_$index, 4))";
    }

    // For historical reasons keyword relevance is requested by writing
    // 'i.relevance' in the expression. Each occurrence is swapped for a
    // calculated expression with its own numbered :normalization_N
    // placeholder. The running counter tells execute() how many of those
    // placeholders need an argument.
    $fragments = explode('i.relevance', $score);
    $score = array_shift($fragments);
    foreach ($fragments as $fragment) {
      $score .= '((ROUND(:normalization_' . $this->relevance_count . ', 4)) * i.score * t.count)' . $fragment;
      $this->relevance_count++;
    }

    $this->scores[] = $score;
    $this->scoresArguments += $arguments;

    return $this;
  }
  /**
   * Runs the second pass of the search and returns the results.
   *
   * Joins the search dataset, applies the exact-match conditions, and adds
   * the combined score expression together with a default ordering on it.
   *
   * Error and warning conditions can apply. Call getStatus() after calling
   * this method to retrieve them.
   *
   * @return \Drupal\Core\Database\StatementInterface|null
   *   A query result set containing the results of the query, or NULL if
   *   preExecute() reported that the query should not run.
   */
  public function execute() {
    if (!$this->preExecute($this)) {
      return NULL;
    }

    // Apply the exact-match conditions against the search dataset.
    $this->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
    if (count($this->conditions)) {
      $this->condition($this->conditions);
    }

    // Fall back to plain keyword relevance when no score was registered.
    if (empty($this->scores)) {
      $this->addScore('i.relevance');
    }

    // addScore() already rewrote the expressions of scores with multipliers;
    // here each one gets the total of all multipliers as its divisor.
    $multiplier_count = count($this->multiply);
    if ($multiplier_count) {
      $multiplier_total = array_sum($this->multiply);
      for ($n = 0; $n < $multiplier_count; $n++) {
        $this->scoresArguments[':total_' . $n] = $multiplier_total;
      }
    }

    // Every 'i.relevance' occurrence has its own placeholder. All of them
    // receive the same keyword relevance normalization number.
    $normalization = 1.0 / $this->normalize;
    $n = 0;
    while ($n < $this->relevance_count) {
      $this->scoresArguments[':normalization_' . $n] = $normalization;
      $n++;
    }

    // The query field is the sum of all score expressions.
    $this->addExpression('SUM(' . implode(' + ', $this->scores) . ')', 'calculated_score', $this->scoresArguments);

    // Sort by the calculated score unless an order was already set.
    if (!count($this->getOrderBy())) {
      $this->orderBy('calculated_score', 'DESC');
    }

    // Add query metadata and the identifying fields of each result.
    $this
      ->addMetaData('normalize', $this->normalize)
      ->fields('i', ['type', 'sid']);

    return $this->query->execute();
  }
  /**
   * Builds the default count query for SearchQuery.
   *
   * Since SearchQuery always uses GROUP BY, the count is computed over a
   * subquery. The same dataset join and conditions as in execute() are added
   * to a clone of the inner query, because countQuery() is called first.
   *
   * @return object
   *   The select query returned by db_select() on the replica target, with a
   *   COUNT(*) expression over the grouped inner query.
   */
  public function countQuery() {
    if (!$this->executedPrepare) {
      $this->prepareAndNormalize();
    }

    // Clone the inner query.
    $inner = clone $this->query;

    // Add conditions to query. The join must also match on langcode, exactly
    // as in execute() and prepareAndNormalize(). Otherwise the LIKE
    // conditions could be satisfied by the dataset row of another language
    // and the count would disagree with the results actually returned.
    $inner->join('search_dataset', 'd', 'i.sid = d.sid AND i.type = d.type AND i.langcode = d.langcode');
    if (count($this->conditions)) {
      $inner->condition($this->conditions);
    }

    // Remove existing fields and expressions, they are not needed for a count
    // query. Both getters return references, so assigning empties them on the
    // cloned query itself.
    $fields =& $inner->getFields();
    $fields = [];
    $expressions =& $inner->getExpressions();
    $expressions = [];

    // Add sid as the only field and count them as a subquery.
    $count = db_select($inner->fields('i', ['sid']), NULL, ['target' => 'replica']);

    // Add the COUNT() expression.
    $count->addExpression('COUNT(*)');

    return $count;
  }
  /**
   * Reports the problems recorded while preparing and running the search.
   *
   * @return int
   *   The status bitmap. Zero means no problems were recorded. Any other
   *   value is a bitwise combination of one or more of these flags:
   *   - SearchQuery::NO_POSITIVE_KEYWORDS
   *   - SearchQuery::EXPRESSIONS_IGNORED
   *   - SearchQuery::LOWER_CASE_OR
   *   - SearchQuery::NO_KEYWORD_MATCHES
   */
  public function getStatus() {
    return $this->status;
  }
  578. }