document.inc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. <?php
  2. /**
  3. * Copyright (c) 2007-2009, Conduit Internet Technologies, Inc.
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions are met:
  8. *
  9. * - Redistributions of source code must retain the above copyright notice,
  10. * this list of conditions and the following disclaimer.
  11. * - Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * - Neither the name of Conduit Internet Technologies, Inc. nor the names of
  15. * its contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  22. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28. * POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * @copyright Copyright 2007-2009 Conduit Internet Technologies, Inc. (http://conduit-it.com)
  31. * @license New BSD (http://solr-php-client.googlecode.com/svn/trunk/COPYING)
  32. * @version $Id: Document.php 15 2009-08-04 17:53:08Z donovan.jimenez $
  33. *
  34. * @package Apache
  35. * @subpackage Solr
  36. * @author Donovan Jimenez <djimenez@conduit-it.com>
  37. */
  38. /**
  39. * Additional code Copyright (c) 2011 by Peter Wolanin, and
  40. * additional contributors.
  41. *
  42. * This program is free software; you can redistribute it and/or modify
  43. * it under the terms of the GNU General Public License as published by
  44. * the Free Software Foundation; either version 2 of the License, or (at
  45. * your option) any later version.
  46. *
  47. * This program is distributed in the hope that it will be useful, but
  48. * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  49. * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  50. * for more details.
  51. *
  52. * You should have received a copy of the GNU General Public License
  53. * along with this program as the file LICENSE.txt; if not, please see
  54. * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
  55. */
  56. /**
  57. * Holds Key / Value pairs that represent a Solr Document along with any
  58. * associated boost values. Field values can be accessed by direct dereferencing
  59. * such as:
  60. *
  61. * @code
  62. * $document->title = 'Something';
  63. * echo $document->title;
  64. * @endcode
  65. *
  66. * Additionally, the field values can be iterated with foreach:
  67. *
  68. * @code
  69. * foreach ($document as $fieldName => $fieldValue) {
  70. * // ...
  71. * }
  72. * @endcode
  73. */
  74. class SearchApiSolrDocument implements IteratorAggregate {
  75. /**
  76. * Document boost value.
  77. *
  78. * @var float|false
  79. */
  80. protected $documentBoost = FALSE;
  81. /**
  82. * Document field values, indexed by name.
  83. *
  84. * @var array
  85. */
  86. protected $fields = array();
  87. /**
  88. * Document field boost values, indexed by name.
  89. *
  90. * @var array
  91. */
  92. protected $fieldBoosts = array();
  93. /**
  94. * Document field update values, indexed by name.
  95. *
  96. * @var array
  97. */
  98. protected $fieldUpdates = array();
  99. /**
  100. * Document nested objects.
  101. *
  102. * @var SearchApiSolrDocument[]
  103. */
  104. protected $nestedObjects = array();
  105. /**
  106. * Clears all boosts and fields from this document.
  107. */
  108. public function clear() {
  109. $this->documentBoost = FALSE;
  110. $this->fields = array();
  111. $this->fieldBoosts = array();
  112. $this->fieldUpdates = array();
  113. $this->nestedObjects = array();
  114. }
  115. /**
  116. * Gets the current document boost.
  117. *
  118. * @return float|false
  119. * The current document boost, or FALSE if none is set.
  120. */
  121. public function getBoost() {
  122. return $this->documentBoost;
  123. }
  124. /**
  125. * Sets the document boost factor.
  126. *
  127. * @param float|false $boost
  128. * FALSE for default boost, or a positive number for setting a document
  129. * boost.
  130. */
  131. public function setBoost($boost) {
  132. $boost = (float) $boost;
  133. if ($boost > 0.0) {
  134. $this->documentBoost = $boost;
  135. }
  136. else {
  137. $this->documentBoost = FALSE;
  138. }
  139. }
  140. /**
  141. * Adds a value to a multi-valued field
  142. *
  143. * NOTE: the solr XML format allows you to specify boosts PER value even
  144. * though the underlying Lucene implementation only allows a boost per field.
  145. * To remedy this, the final field boost value will be the product of all
  146. * specified boosts on field values - this is similar to SolrJ's
  147. * functionality.
  148. *
  149. * @code
  150. * $doc = new ApacheSolrDocument();
  151. * $doc->addField('foo', 'bar', 2.0);
  152. * $doc->addField('foo', 'baz', 3.0);
  153. * // Resultant field boost will be 6!
  154. * echo $doc->getFieldBoost('foo');
  155. * @endcode
  156. *
  157. * @param string $key
  158. * The name of the field.
  159. * @param $value
  160. * The value to add for the field.
  161. * @param float|false $boost
  162. * FALSE for default boost, or a positive number for setting a field boost.
  163. */
  164. public function addField($key, $value, $boost = FALSE) {
  165. if (!isset($this->fields[$key])) {
  166. // create holding array if this is the first value
  167. $this->fields[$key] = array();
  168. }
  169. else if (!is_array($this->fields[$key])) {
  170. // move existing value into array if it is not already an array
  171. $this->fields[$key] = array($this->fields[$key]);
  172. }
  173. if ($this->getFieldBoost($key) === FALSE) {
  174. // boost not already set, set it now
  175. $this->setFieldBoost($key, $boost);
  176. }
  177. else if ((float) $boost > 0.0) {
  178. // multiply passed boost with current field boost - similar to SolrJ implementation
  179. $this->fieldBoosts[$key] *= (float) $boost;
  180. }
  181. // add value to array
  182. $this->fields[$key][] = $value;
  183. }
  184. /**
  185. * Gets information about a field stored in Solr.
  186. *
  187. * @param string $key
  188. * The name of the field.
  189. *
  190. * @return array|false
  191. * An associative array of info if the field exists, FALSE otherwise.
  192. */
  193. public function getField($key) {
  194. if (isset($this->fields[$key])) {
  195. return array(
  196. 'name' => $key,
  197. 'value' => $this->fields[$key],
  198. 'boost' => $this->getFieldBoost($key)
  199. );
  200. }
  201. return FALSE;
  202. }
  203. /**
  204. * Sets a field value.
  205. *
  206. * Multi-valued fields should be set as arrays or via the addField()
  207. * function which will automatically make sure the field is an array.
  208. *
  209. * @param string $key
  210. * The name of the field.
  211. * @param string|array $value
  212. * The value to set for the field.
  213. * @param float|false $boost
  214. * FALSE for default boost, or a positive number for setting a field boost.
  215. */
  216. public function setField($key, $value, $boost = FALSE) {
  217. $this->fields[$key] = $value;
  218. $this->setFieldBoost($key, $boost);
  219. }
  220. /**
  221. * Gets the currently set field boost for a document field.
  222. *
  223. * @param string $key
  224. * The name of the field.
  225. *
  226. * @return float|false
  227. * The currently set field boost, or FALSE if none was set.
  228. */
  229. public function getFieldBoost($key) {
  230. return isset($this->fieldBoosts[$key]) ? $this->fieldBoosts[$key] : FALSE;
  231. }
  232. /**
  233. * Sets the field boost for a document field.
  234. *
  235. * @param string $key
  236. * The name of the field.
  237. * @param float|false $boost
  238. * FALSE for default boost, or a positive number for setting a field boost.
  239. */
  240. public function setFieldBoost($key, $boost) {
  241. $boost = (float) $boost;
  242. if ($boost > 0.0) {
  243. $this->fieldBoosts[$key] = $boost;
  244. }
  245. else {
  246. $this->fieldBoosts[$key] = FALSE;
  247. }
  248. }
  249. /**
  250. * Returns all current field boosts, indexed by field name.
  251. *
  252. * @return array
  253. * An associative array in the format $field_name => $field_boost.
  254. */
  255. public function getFieldBoosts() {
  256. return $this->fieldBoosts;
  257. }
  258. /**
  259. * Gets the currently set field's 'update' attribute for a document field.
  260. *
  261. * @param string $key
  262. * The name of the field.
  263. *
  264. * @return string|false
  265. * The currently set field's update attribute, or FALSE if none was set.
  266. */
  267. public function getFieldUpdate($key) {
  268. return isset($this->fieldUpdates[$key]) ? $this->fieldUpdates[$key] : FALSE;
  269. }
  270. /**
  271. * Sets the field's 'update' attribute for a document field.
  272. *
  273. * @param string $key
  274. * The name of the field.
  275. * @param string|false $update
  276. * One of the allowed update values ('add', 'set', 'inc').
  277. */
  278. public function setFieldUpdate($key, $update) {
  279. $this->fieldUpdates[$key] = $update;
  280. }
  281. /**
  282. * Retrieves all currently set field updates.
  283. *
  284. * @return string[]
  285. * Associative array of field's "update" attributes that were set, keyed by
  286. * field name.
  287. */
  288. public function getFieldUpdates() {
  289. return $this->fieldUpdates;
  290. }
  291. /**
  292. * Gets the names of all fields in this document.
  293. *
  294. * @return array
  295. * The names of all fields in this document.
  296. */
  297. public function getFieldNames() {
  298. return array_keys($this->fields);
  299. }
  300. /**
  301. * Gets the values of all fields in this document.
  302. *
  303. * @return array
  304. * The values of all fields in this document.
  305. */
  306. public function getFieldValues() {
  307. return array_values($this->fields);
  308. }
  309. /**
  310. * Retrieves the nested documents set on this document.
  311. *
  312. * @return \SearchApiSolrDocument[]
  313. * The nested documents.
  314. */
  315. public function getNestedObjects() {
  316. return $this->nestedObjects;
  317. }
  318. /**
  319. * Sets an array of nested documents.
  320. *
  321. * Populate nested documents for use with block join queries. Note that this
  322. * will lead to errors when used with Solr versions older than 4.5.
  323. *
  324. * @param SearchApiSolrDocument[] $nested_documents
  325. * An array of SearchApiSolrDocument objects.
  326. */
  327. public function setNestedDocuments(array $nested_documents) {
  328. $this->nestedObjects = $nested_documents;
  329. }
  330. /**
  331. * Implements IteratorAggregate::getIterator().
  332. *
  333. * Implementing the IteratorAggregate interface allows the following usage:
  334. * @code
  335. * foreach ($document as $key => $value) {
  336. * // ...
  337. * }
  338. * @endcode
  339. *
  340. * @return Traversable
  341. * An iterator over this document's fields.
  342. */
  343. public function getIterator() {
  344. $arrayObject = new ArrayObject($this->fields);
  345. return $arrayObject->getIterator();
  346. }
  347. /**
  348. * Magic getter for field values.
  349. *
  350. * @param string $key
  351. * The name of the field.
  352. *
  353. * @return string|array|null
  354. * The value that was set for the field.
  355. */
  356. public function __get($key) {
  357. return $this->fields[$key];
  358. }
  359. /**
  360. * Magic setter for field values.
  361. *
  362. * Multi-valued fields should be set as arrays or via the addField() function
  363. * which will automatically make sure the field is an array.
  364. *
  365. * @param string $key
  366. * The name of the field.
  367. * @param string|array $value
  368. * The value to set for the field.
  369. */
  370. public function __set($key, $value) {
  371. $this->setField($key, $value);
  372. }
  373. /**
  374. * Magic isset for fields values.
  375. *
  376. * Do not call directly. Allows the following usage:
  377. * @code
  378. * isset($document->some_field);
  379. * @endcode
  380. *
  381. * @param string $key
  382. * The name of the field.
  383. *
  384. * @return bool
  385. * Whether the given key is set in this document.
  386. */
  387. public function __isset($key) {
  388. return isset($this->fields[$key]);
  389. }
  390. /**
  391. * Magic unset for field values.
  392. *
  393. * Do not call directly. Allows the following usage:
  394. * @code
  395. * unset($document->some_field);
  396. * @endcode
  397. *
  398. * @param string $key
  399. * The name of the field.
  400. */
  401. public function __unset($key) {
  402. unset($this->fields[$key]);
  403. unset($this->fieldBoosts[$key]);
  404. }
  405. /**
  406. * Create an XML fragment from this document.
  407. *
  408. * This string can then be used inside a Solr add call.
  409. *
  410. * @return string
  411. * An XML formatted string for this document.
  412. */
  413. public function toXml() {
  414. $xml = '<doc';
  415. if ($this->documentBoost !== FALSE) {
  416. $xml .= ' boost="' . $this->documentBoost . '"';
  417. }
  418. $xml .= '>';
  419. foreach ($this->fields as $key => $values) {
  420. $fieldBoost = $this->getFieldBoost($key);
  421. $fieldUpdate = $this->getFieldUpdate($key);
  422. $key = htmlspecialchars($key, ENT_COMPAT, 'UTF-8');
  423. if (!is_array($values)) {
  424. $values = array($values);
  425. }
  426. foreach ($values as $value) {
  427. $xml .= '<field name="' . $key . '"';
  428. if ($fieldBoost !== FALSE) {
  429. $xml .= ' boost="' . $fieldBoost . '"';
  430. // Only set the boost for the first field in the set.
  431. $fieldBoost = FALSE;
  432. }
  433. if ($fieldUpdate !== FALSE) {
  434. $xml .= ' update="' . $fieldUpdate . '"';
  435. }
  436. $xml .= '>' . htmlspecialchars($value, ENT_NOQUOTES, 'UTF-8') . '</field>';
  437. }
  438. }
  439. // If nested objects have been added, include them in the XML to be indexed.
  440. foreach ($this->nestedObjects as $object) {
  441. // Skip any documents that aren't of the correct type.
  442. if (!($object instanceof SearchApiSolrDocument)) {
  443. $vars['@type'] = is_object($object) ? get_class($object) : gettype($object);
  444. watchdog('search_api_solr', 'Attempt to add an invalid nested Solr document of type @type.', $vars, WATCHDOG_ERROR);
  445. continue;
  446. }
  447. // Generate the markup for each nested document.
  448. $xml .= $object->toXml();
  449. }
  450. $xml .= '</doc>';
  451. // Remove any control characters to avoid Solr XML parser exception.
  452. return self::stripCtrlChars($xml);
  453. }
  454. /**
  455. * Sanitizes XML for sending to Solr.
  456. *
  457. * Replaces control (non-printable) characters that are invalid to Solr's XML
  458. * parser with a space.
  459. *
  460. * @param string $string
  461. * The string to sanitize.
  462. *
  463. * @return string
  464. * A string safe for including in a Solr request.
  465. */
  466. public static function stripCtrlChars($string) {
  467. // See: http://w3.org/International/questions/qa-forms-utf-8.html
  468. // Printable utf-8 does not include any of these chars below x7F
  469. return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $string);
  470. }
  471. }