Crawler.php 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMElement objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. */
  17. class Crawler extends \SplObjectStorage
  18. {
  19. /**
  20. * @var string The current URI
  21. */
  22. protected $uri;
  23. /**
  24. * @var string The default namespace prefix to be used with XPath and CSS expressions
  25. */
  26. private $defaultNamespacePrefix = 'default';
  27. /**
  28. * @var array A map of manually registered namespaces
  29. */
  30. private $namespaces = array();
  31. /**
  32. * @var string The base href value
  33. */
  34. private $baseHref;
  /**
   * Creates a crawler and optionally seeds it with an initial set of nodes.
   *
   * @param mixed  $node       Initial content, handed unchanged to add()
   * @param string $currentUri The current URI
   * @param string $baseHref   The base href value; the current URI is used when this is falsy
   */
  public function __construct($node = null, $currentUri = null, $baseHref = null)
  {
      $this->uri = $currentUri;

      if ($baseHref) {
          $this->baseHref = $baseHref;
      } else {
          $this->baseHref = $currentUri;
      }

      $this->add($node);
  }
  /**
   * Empties the crawler.
   */
  public function clear()
  {
      // removeAll() removes every object held by the storage it is given;
      // handing it the crawler itself therefore removes all nodes.
      $this->removeAll($this);
  }
  /**
   * Adds content to the current list of nodes.
   *
   * The argument is dispatched on its type to addNodeList(), addNode(),
   * addNodes() or addContent(). Passing null is a no-op.
   *
   * @param \DOMNodeList|\DOMNode|array|string|null $node A node
   *
   * @throws \InvalidArgumentException When node is not the expected type.
   */
  public function add($node)
  {
      if (null === $node) {
          return;
      }

      if ($node instanceof \DOMNodeList) {
          $this->addNodeList($node);

          return;
      }

      if ($node instanceof \DOMNode) {
          $this->addNode($node);

          return;
      }

      if (is_array($node)) {
          $this->addNodes($node);

          return;
      }

      if (is_string($node)) {
          $this->addContent($node);

          return;
      }

      $actualType = is_object($node) ? get_class($node) : gettype($node);

      throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
  }
  /**
   * Adds HTML/XML content.
   *
   * When no type is given, content starting with "<?xml" is treated as
   * application/xml and anything else as text/html. Types that mention
   * neither "xml" nor "html" are ignored.
   *
   * The charset is taken from the "charset=" part of the type, then from a
   * <meta ... charset=...> tag in the content; if neither is present it is
   * assumed to be ISO-8859-1, which is the default charset defined by the
   * HTTP 1.1 specification.
   *
   * @param string      $content A string to parse as HTML/XML
   * @param null|string $type    The content type of the string
   */
  public function addContent($content, $type = null)
  {
      if (empty($type)) {
          $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
      }

      // DOM only for HTML/XML content
      if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
          return;
      }

      $charset = null;
      if (false !== $pos = stripos($type, 'charset=')) {
          // 8 is the length of "charset="
          $charset = substr($type, $pos + 8);
          if (false !== $pos = strpos($charset, ';')) {
              $charset = substr($charset, 0, $pos);
          }
      }

      // http://www.w3.org/TR/encoding/#encodings
      // http://www.w3.org/TR/REC-xml/#NT-EncName
      if (null === $charset &&
          preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
          $charset = $matches[1];
      }

      if (null === $charset) {
          $charset = 'ISO-8859-1';
      }

      // The type is matched case-insensitively above, so the captured group has
      // to be normalised before it is compared ("text/XML" captures "X").
      if ('x' === strtolower($xmlMatches[1])) {
          $this->addXmlContent($content, $charset);
      } else {
          $this->addHtmlContent($content, $charset);
      }
  }
  /**
   * Adds an HTML content to the list of nodes.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * Before parsing, the content is converted to HTML entities on a
   * best-effort basis: if the conversion fails, the content is parsed as given.
   * After parsing, a non-empty <base href> found in the crawler updates the
   * base href used for links and forms.
   *
   * @param string $content The HTML content
   * @param string $charset The charset
   */
  public function addHtmlContent($content, $charset = 'UTF-8')
  {
      $internalErrors = libxml_use_internal_errors(true);
      $disableEntities = libxml_disable_entity_loader(true);

      $dom = new \DOMDocument('1.0', $charset);
      $dom->validateOnParse = true;

      // Turn any PHP error raised by the conversion into an exception so that it
      // can be swallowed by the catch blocks below.
      set_error_handler(function () {throw new \Exception();});

      try {
          // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
          if (function_exists('mb_convert_encoding')) {
              $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
          } elseif (function_exists('iconv')) {
              $content = preg_replace_callback(
                  '/[\x80-\xFF]+/',
                  function ($m) {
                      // Decode each UTF-8 sequence to its code point and emit a numeric entity.
                      $m = unpack('C*', $m[0]);
                      $i = 1;
                      $entities = '';

                      while (isset($m[$i])) {
                          if (0xF0 <= $m[$i]) {
                              $c = (($m[$i++] - 0xF0) << 18) + (($m[$i++] - 0x80) << 12) + (($m[$i++] - 0x80) << 6) + $m[$i++] - 0x80;
                          } elseif (0xE0 <= $m[$i]) {
                              $c = (($m[$i++] - 0xE0) << 12) + (($m[$i++] - 0x80) << 6) + $m[$i++] - 0x80;
                          } else {
                              $c = (($m[$i++] - 0xC0) << 6) + $m[$i++] - 0x80;
                          }

                          $entities .= '&#'.$c.';';
                      }

                      return $entities;
                  },
                  iconv($charset, 'UTF-8', $content)
              );
          }
      } catch (\Exception $e) {
          // Conversion failed: keep the content untouched.
      } catch (\ValueError $e) {
          // PHP 8+ reports an unknown charset with a ValueError (which is not an
          // \Exception) instead of a warning; treat it like any other failed conversion
          // so the error handler and the libxml settings are still restored below.
      }

      restore_error_handler();

      if ('' !== trim($content)) {
          @$dom->loadHTML($content);
      }

      libxml_use_internal_errors($internalErrors);
      libxml_disable_entity_loader($disableEntities);

      $this->addDocument($dom);

      $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

      $baseHref = current($base);
      if (count($base) && !empty($baseHref)) {
          if ($this->baseHref) {
              // Resolve the <base href> against the base href already known.
              $linkNode = $dom->createElement('a');
              $linkNode->setAttribute('href', $baseHref);
              $link = new Link($linkNode, $this->baseHref);
              $this->baseHref = $link->getUri();
          } else {
              $this->baseHref = $baseHref;
          }
      }
  }
  /**
   * Adds an XML content to the list of nodes.
   *
   * The libxml errors are disabled when the content is parsed.
   *
   * If you want to get parsing errors, be sure to enable
   * internal errors via libxml_use_internal_errors(true)
   * and then, get the errors via libxml_get_errors(). Be
   * sure to clear errors with libxml_clear_errors() afterward.
   *
   * @param string $content The XML content
   * @param string $charset The charset
   */
  public function addXmlContent($content, $charset = 'UTF-8')
  {
      // When the content declares no prefixed namespace ("xmlns:"), every "xmlns"
      // occurrence is renamed to "ns": this removes the default namespace and keeps
      // XPath expressions simple.
      $declaresPrefixedNamespace = (bool) preg_match('/xmlns:/', $content);
      if (!$declaresPrefixedNamespace) {
          $content = str_replace('xmlns', 'ns', $content);
      }

      $previousInternalErrors = libxml_use_internal_errors(true);
      $previousEntityLoader = libxml_disable_entity_loader(true);

      $document = new \DOMDocument('1.0', $charset);
      $document->validateOnParse = true;

      $isBlank = '' === trim($content);
      if (!$isBlank) {
          @$document->loadXML($content, LIBXML_NONET);
      }

      // Put the libxml settings back the way they were found.
      libxml_use_internal_errors($previousInternalErrors);
      libxml_disable_entity_loader($previousEntityLoader);

      $this->addDocument($document);
  }
  /**
   * Adds the root element of a \DOMDocument to the list of nodes.
   *
   * A document without a root element adds nothing.
   *
   * @param \DOMDocument $dom A \DOMDocument instance
   */
  public function addDocument(\DOMDocument $dom)
  {
      $root = $dom->documentElement;
      if (!$root) {
          return;
      }

      $this->addNode($root);
  }
  /**
   * Adds every \DOMNode of a \DOMNodeList to the list of nodes.
   *
   * Items that are not \DOMNode instances are skipped.
   *
   * @param \DOMNodeList $nodes A \DOMNodeList instance
   */
  public function addNodeList(\DOMNodeList $nodes)
  {
      foreach ($nodes as $item) {
          if (!$item instanceof \DOMNode) {
              continue;
          }

          $this->addNode($item);
      }
  }
  /**
   * Adds each entry of an array to the list of nodes.
   *
   * Every entry is passed to add(), so an entry may be anything add() accepts.
   *
   * @param \DOMNode[] $nodes An array of \DOMNode instances
   */
  public function addNodes(array $nodes)
  {
      foreach ($nodes as $entry) {
          $this->add($entry);
      }
  }
  /**
   * Adds a \DOMNode instance to the list of nodes.
   *
   * When a \DOMDocument is given, its root element is added instead of the
   * document itself. A document without a root element adds nothing.
   *
   * @param \DOMNode $node A \DOMNode instance
   */
  public function addNode(\DOMNode $node)
  {
      if ($node instanceof \DOMDocument) {
          $node = $node->documentElement;

          if (null === $node) {
              // Empty document: there is no element to attach, and attach()
              // only accepts objects.
              return;
          }
      }

      $this->attach($node);
  }
  /**
   * Always fails.
   *
   * Serializing and unserializing a crawler creates DOM objects in a corrupted
   * state: DOM elements are not properly serializable.
   *
   * @param string $serialized Ignored
   *
   * @throws \BadMethodCallException Always
   */
  public function unserialize($serialized)
  {
      $reason = 'A Crawler cannot be serialized.';

      throw new \BadMethodCallException($reason);
  }
  /**
   * Always fails: a crawler holds DOM nodes and cannot be serialized.
   *
   * @throws \BadMethodCallException Always
   */
  public function serialize()
  {
      $reason = 'A Crawler cannot be serialized.';

      throw new \BadMethodCallException($reason);
  }
  /**
   * Returns a node given its position in the node list.
   *
   * @param int $position The position
   *
   * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
   */
  public function eq($position)
  {
      $selected = null;

      foreach ($this as $index => $candidate) {
          if ($index == $position) {
              $selected = $candidate;

              break;
          }
      }

      // A null selection yields an empty crawler.
      return $this->createSubCrawler($selected);
  }
  /**
   * Calls an anonymous function on each node of the list.
   *
   * The anonymous function is called with the node wrapped in a Crawler
   * instance as first argument and the position of the node as second argument.
   *
   * Example:
   *
   *     $crawler->filter('h1')->each(function ($node, $i) {
   *         return $node->text();
   *     });
   *
   * @param \Closure $closure An anonymous function
   *
   * @return array An array of values returned by the anonymous function
   */
  public function each(\Closure $closure)
  {
      $results = array();

      foreach ($this as $index => $element) {
          $wrapped = $this->createSubCrawler($element);
          $results[] = $closure($wrapped, $index);
      }

      return $results;
  }
  /**
   * Slices the list of nodes by $offset and $length.
   *
   * Both values are passed to a \LimitIterator wrapped around the crawler.
   *
   * @param int $offset
   * @param int $length
   *
   * @return Crawler A Crawler instance with the sliced nodes
   */
  public function slice($offset = 0, $length = -1)
  {
      $window = new \LimitIterator($this, $offset, $length);
      $selected = iterator_to_array($window);

      return $this->createSubCrawler($selected);
  }
  /**
   * Reduces the list of nodes by calling an anonymous function.
   *
   * The anonymous function is called with each node wrapped in a Crawler
   * instance and the position of the node. A node is removed from the result
   * only when the function returns exactly false.
   *
   * @param \Closure $closure An anonymous function
   *
   * @return Crawler A Crawler instance with the selected nodes.
   */
  public function reduce(\Closure $closure)
  {
      $kept = array();

      foreach ($this as $index => $element) {
          $verdict = $closure($this->createSubCrawler($element), $index);
          if (false === $verdict) {
              continue;
          }

          $kept[] = $element;
      }

      return $this->createSubCrawler($kept);
  }
  /**
   * Returns the first node of the current selection.
   *
   * @return Crawler A Crawler instance with the first selected node (empty when the selection is empty)
   */
  public function first()
  {
      $firstPosition = 0;

      return $this->eq($firstPosition);
  }
  /**
   * Returns the last node of the current selection.
   *
   * @return Crawler A Crawler instance with the last selected node (empty when the selection is empty)
   */
  public function last()
  {
      $lastPosition = $this->count() - 1;

      return $this->eq($lastPosition);
  }
  /**
   * Returns the sibling elements of the first node of the current selection.
   *
   * The first node itself is not part of the result.
   *
   * @return Crawler A Crawler instance with the sibling nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function siblings()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      // Walk the parent's children from the very first one.
      $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;

      return $this->createSubCrawler($this->sibling($firstChildOfParent));
  }
  /**
   * Returns the elements that follow the first node of the current selection
   * among its siblings.
   *
   * @return Crawler A Crawler instance with the next sibling nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nextAll()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $start = $this->getNode(0);
      $following = $this->sibling($start);

      return $this->createSubCrawler($following);
  }
  /**
   * Returns the elements that precede the first node of the current selection
   * among its siblings, nearest sibling first.
   *
   * @return Crawler A Crawler instance with the previous sibling nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function previousAll()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $start = $this->getNode(0);
      $preceding = $this->sibling($start, 'previousSibling');

      return $this->createSubCrawler($preceding);
  }
  /**
   * Returns the ancestor elements of the first node of the current selection,
   * nearest ancestor first.
   *
   * @return Crawler A Crawler instance with the parents nodes of the current selection
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function parents()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $ancestors = array();

      for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
          // Only element nodes are kept.
          if (XML_ELEMENT_NODE === $current->nodeType) {
              $ancestors[] = $current;
          }
      }

      return $this->createSubCrawler($ancestors);
  }
  /**
   * Returns the child elements of the first node of the current selection.
   *
   * @return Crawler A Crawler instance with the children nodes
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function children()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $firstChild = $this->getNode(0)->firstChild;
      if (!$firstChild) {
          return $this->createSubCrawler(array());
      }

      return $this->createSubCrawler($this->sibling($firstChild));
  }
  /**
   * Returns the attribute value of the first node of the list.
   *
   * @param string $attribute The attribute name
   *
   * @return string|null The attribute value or null if the attribute does not exist
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function attr($attribute)
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $element = $this->getNode(0);
      if (!$element->hasAttribute($attribute)) {
          return null;
      }

      return $element->getAttribute($attribute);
  }
  /**
   * Returns the node name of the first node of the list.
   *
   * @return string The node name
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function nodeName()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      return $first->nodeName;
  }
  /**
   * Returns the node value of the first node of the list.
   *
   * @return string The node value
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function text()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $first = $this->getNode(0);

      return $first->nodeValue;
  }
  /**
   * Returns the first node of the list as HTML.
   *
   * The result is the concatenated HTML of the node's children; the node's
   * own tag is not included.
   *
   * @return string The node html
   *
   * @throws \InvalidArgumentException When current node is empty
   */
  public function html()
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $fragments = array();
      foreach ($this->getNode(0)->childNodes as $child) {
          $fragments[] = $child->ownerDocument->saveHTML($child);
      }

      return implode('', $fragments);
  }
  /**
   * Extracts information from the list of nodes.
   *
   * You can extract attributes or/and the node value (_text).
   *
   * Example:
   *
   *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
   *
   * When more than one attribute is requested, each node yields an array of
   * values in the requested order; otherwise each node yields a single value.
   *
   * @param array $attributes An array of attributes
   *
   * @return array An array of extracted values
   */
  public function extract($attributes)
  {
      $names = (array) $attributes;
      $multiple = count($names) > 1;

      $rows = array();
      foreach ($this as $node) {
          $values = array();
          foreach ($names as $name) {
              $values[] = '_text' === $name ? $node->nodeValue : $node->getAttribute($name);
          }

          if ($multiple) {
              $rows[] = $values;
          } else {
              $rows[] = $values[0];
          }
      }

      return $rows;
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression is evaluated in the context of the crawler, which
   * is considered as a fake parent of the elements inside it.
   * This means that a child selector "div" or "./div" will match only
   * the div elements of the current crawler, not their children.
   *
   * @param string $xpath An XPath expression
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   */
  public function filterXPath($xpath)
  {
      $relative = $this->relativize($xpath);

      // Nothing left to evaluate after the rewrite: nothing can match.
      $nothingToMatch = '' === $relative;

      return $nothingToMatch
          ? $this->createSubCrawler(null)
          : $this->filterRelativeXPath($relative);
  }
  /**
   * Filters the list of nodes with a CSS selector.
   *
   * This method only works if you have installed the CssSelector Symfony Component.
   *
   * @param string $selector A CSS selector
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   *
   * @throws \RuntimeException if the CssSelector Component is not available
   */
  public function filter($selector)
  {
      $converterAvailable = class_exists('Symfony\\Component\\CssSelector\\CssSelector');
      if (!$converterAvailable) {
          throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
      }

      // The CssSelector already prefixes the selector with descendant-or-self::
      $xpath = CssSelector::toXPath($selector);

      return $this->filterRelativeXPath($xpath);
  }
  /**
   * Selects links by name or alt value for clickable images.
   *
   * An <a> element is selected when the given value appears as a whole
   * space-delimited chunk of its whitespace-normalized text, or of the alt
   * attribute of an <img> directly inside it.
   *
   * @param string $value The link text
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   */
  public function selectLink($value)
  {
      // The value is padded with spaces so that it is matched as a delimited chunk.
      $needle = static::xpathLiteral(' '.$value.' ');

      $byText = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $needle);
      $byImageAlt = sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $needle);

      return $this->filterRelativeXPath($byText.$byImageAlt);
  }
  /**
   * Selects a button by name or alt value for images.
   *
   * Matches <input> elements of type submit/button (by value) or image (by alt),
   * and <button> elements (by text); both kinds also match on their id or name.
   * The type attribute is compared case-insensitively.
   *
   * @param string $value The button text
   *
   * @return Crawler A new instance of Crawler with the filtered list of nodes
   */
  public function selectButton($value)
  {
      // Lower-cases the type attribute inside the XPath expression.
      $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

      // Padded with spaces to match the value as a delimited chunk of text.
      $paddedLiteral = static::xpathLiteral(' '.$value.' ');
      // Unpadded for exact id/name comparison.
      $exactLiteral = static::xpathLiteral($value);

      $inputByValue = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $paddedLiteral);
      $inputByAltOrIdentity = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $paddedLiteral, $exactLiteral, $exactLiteral);
      $buttonElement = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

      return $this->filterRelativeXPath($inputByValue.$inputByAltOrIdentity.$buttonElement);
  }
  /**
   * Returns a Link object for the first node in the list.
   *
   * The link is created with the crawler's base href.
   *
   * @param string $method The method for the link (get by default)
   *
   * @return Link A Link instance
   *
   * @throws \InvalidArgumentException If the current node list is empty
   */
  public function link($method = 'get')
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      return new Link($this->getNode(0), $this->baseHref, $method);
  }
  /**
   * Returns an array of Link objects for the nodes in the list.
   *
   * Every link is created with the crawler's base href and the "get" method.
   *
   * @return Link[] An array of Link instances
   */
  public function links()
  {
      $collected = array();

      foreach ($this as $element) {
          $link = new Link($element, $this->baseHref, 'get');
          $collected[] = $link;
      }

      return $collected;
  }
  /**
   * Returns a Form object for the first node in the list.
   *
   * The form is created with the crawler's current URI and base href.
   *
   * @param array  $values An array of values for the form fields; applied with setValues() unless null
   * @param string $method The method for the form
   *
   * @return Form A Form instance
   *
   * @throws \InvalidArgumentException If the current node list is empty
   */
  public function form(array $values = null, $method = null)
  {
      if (0 === count($this)) {
          throw new \InvalidArgumentException('The current node list is empty.');
      }

      $result = new Form($this->getNode(0), $this->uri, $method, $this->baseHref);

      if (null === $values) {
          return $result;
      }

      $result->setValues($values);

      return $result;
  }
  /**
   * Overloads a default namespace prefix to be used with XPath and CSS expressions.
   *
   * When this prefix is used in an expression, it is resolved to the
   * namespace declared without a prefix in the document.
   *
   * @param string $prefix
   */
  public function setDefaultNamespacePrefix($prefix)
  {
      $this->defaultNamespacePrefix = $prefix;
  }
  /**
   * Registers a namespace manually.
   *
   * A manually registered prefix takes precedence over the namespaces
   * discovered in the document when an XPath expression is evaluated.
   *
   * @param string $prefix    The prefix used in expressions
   * @param string $namespace The namespace the prefix stands for
   */
  public function registerNamespace($prefix, $namespace)
  {
      $this->namespaces[$prefix] = $namespace;
  }
  /**
   * Converts string for XPath expressions.
   *
   * Escaped characters are: quotes (") and apostrophe (').
   *
   * XPath 1.0 string literals have no escape sequence, so a value containing
   * both kinds of quote is rebuilt with concat().
   *
   *  Examples:
   *  <code>
   *     echo Crawler::xpathLiteral('foo " bar');
   *     //prints 'foo " bar'
   *
   *     echo Crawler::xpathLiteral("foo ' bar");
   *     //prints "foo ' bar"
   *
   *     echo Crawler::xpathLiteral('a\'b"c');
   *     //prints concat('a', "'", 'b"c')
   *  </code>
   *
   * @param string $s String to be escaped
   *
   * @return string Converted string
   */
  public static function xpathLiteral($s)
  {
      // No apostrophe: a single-quoted literal is enough.
      if (false === strpos($s, "'")) {
          return sprintf("'%s'", $s);
      }

      // Apostrophes but no double quote: use a double-quoted literal.
      if (false === strpos($s, '"')) {
          return sprintf('"%s"', $s);
      }

      // Both kinds of quote: single-quote every chunk between apostrophes and
      // put a double-quoted apostrophe between consecutive chunks.
      $parts = array();
      foreach (explode("'", $s) as $chunk) {
          $parts[] = sprintf("'%s'", $chunk);
      }

      // implode() takes the separator first; the reversed order is rejected by PHP 8.
      return sprintf('concat(%s)', implode(', "\'", ', $parts));
  }
  /**
   * Filters the list of nodes with an XPath expression.
   *
   * The XPath expression should already be processed to apply it in the context of each node.
   * It is evaluated once per node of the crawler, and all results are gathered
   * in a single new crawler.
   *
   * @param string $xpath
   *
   * @return Crawler
   */
  private function filterRelativeXPath($xpath)
  {
      $prefixes = $this->findNamespacePrefixes($xpath);
      $result = $this->createSubCrawler(null);

      foreach ($this as $contextNode) {
          $engine = $this->createDOMXPath($contextNode->ownerDocument, $prefixes);
          $matched = $engine->query($xpath, $contextNode);
          $result->add($matched);
      }

      return $result;
  }
  /**
   * Make the XPath relative to the current context.
   *
   * The returned XPath will match elements matching the XPath inside the current crawler
   * when running in the context of a node of the crawler.
   *
   * The expression is split on its top-level union operators, each operand is
   * rewritten on its own, and the operands are joined back with " | ".
   *
   * @param string $xpath
   *
   * @return string
   */
  private function relativize($xpath)
  {
      // A union operator, unless it sits inside a [...] predicate.
      $unionSeparator = '/\|(?![^\[]*\])/';

      // An expression which never matches. It replaces operands that cannot
      // match anything in the context of the crawler.
      $neverMatches = 'a[name() = "b"]';

      $rewritten = array();

      foreach (preg_split($unionSeparator, $xpath) as $operand) {
          $operand = trim($operand);

          // Opening parentheses in front of the operand are set aside and put
          // back afterwards, so that only the expression inside them is rewritten.
          $leading = '';
          if (preg_match('/^[\(\s*]+/', $operand, $matches)) {
              $leading = $matches[0];
              $operand = substr($operand, strlen($leading));
          }

          // BC for Symfony 2.4 and lower, where elements were added to a fake _root parent
          if (0 === strpos($operand, '/_root/')) {
              $operand = './'.substr($operand, 7);
          } elseif (0 === strpos($operand, 'self::*/')) {
              $operand = './'.substr($operand, 8);
          }

          // Add a prefix before absolute element selectors. The order of the
          // checks matters: longer prefixes are tested before shorter ones.
          if (empty($operand)) {
              $operand = $neverMatches;
          } elseif (0 === strpos($operand, '//')) {
              $operand = 'descendant-or-self::'.substr($operand, 2);
          } elseif (0 === strpos($operand, './/')) {
              $operand = 'descendant-or-self::'.substr($operand, 3);
          } elseif (0 === strpos($operand, './')) {
              $operand = 'self::'.substr($operand, 2);
          } elseif (0 === strpos($operand, 'child::')) {
              $operand = 'self::'.substr($operand, 7);
          } elseif ('/' === $operand[0] || 0 === strpos($operand, 'self::')) {
              // the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
              // so let's drop the expression entirely
              $operand = $neverMatches;
          } elseif ('.' === $operand[0]) {
              // '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
              $operand = $neverMatches;
          } elseif (0 === strpos($operand, 'descendant::')) {
              $operand = 'descendant-or-self::'.substr($operand, strlen('descendant::'));
          } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $operand)) {
              // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
              $operand = $neverMatches;
          } elseif (0 !== strpos($operand, 'descendant-or-self::')) {
              $operand = 'self::'.$operand;
          }

          $rewritten[] = $leading.$operand;
      }

      return implode(' | ', $rewritten);
  }
  /**
   * Returns the node stored at the given position.
   *
   * @param int $position
   *
   * @return \DOMElement|null The node, or null when there is no node at that position
   */
  public function getNode($position)
  {
      $found = null;

      foreach ($this as $index => $candidate) {
          if ($index == $position) {
              $found = $candidate;

              break;
          }
      }

      return $found;
  }
  /**
   * Collects element nodes by walking a chain of siblings.
   *
   * The walk starts on the given node (which is itself a candidate) and follows
   * the property named by $siblingDir until it is empty. The first node of the
   * crawler is never part of the result.
   *
   * @param \DOMElement $node       The node to start from
   * @param string      $siblingDir The property to follow: 'nextSibling' or 'previousSibling'
   *
   * @return array
   */
  protected function sibling($node, $siblingDir = 'nextSibling')
  {
      $excluded = $this->getNode(0);
      $collected = array();
      $current = $node;

      do {
          $isElement = 1 === $current->nodeType;
          if ($current !== $excluded && $isElement) {
              $collected[] = $current;
          }

          $current = $current->$siblingDir;
      } while ($current);

      return $collected;
  }
  /**
   * Creates a \DOMXPath for a document and registers on it every given prefix
   * for which a namespace can be found.
   *
   * @param \DOMDocument $document
   * @param array        $prefixes The namespace prefixes to register
   *
   * @return \DOMXPath
   *
   * @throws \InvalidArgumentException
   */
  private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
  {
      $engine = new \DOMXPath($document);

      foreach ($prefixes as $prefix) {
          $uri = $this->discoverNamespace($engine, $prefix);
          if (null === $uri) {
              // Unknown prefix: leave it unregistered.
              continue;
          }

          $engine->registerNamespace($prefix, $uri);
      }

      return $engine;
  }
  /**
   * Finds the namespace a prefix stands for.
   *
   * Manually registered namespaces are looked up first. Otherwise the document
   * is searched for a namespace declaration with that prefix; the default
   * namespace prefix is searched as the declaration with an empty name.
   *
   * @param \DOMXPath $domxpath
   * @param string    $prefix
   *
   * @return string|null The namespace, or null when none is found
   *
   * @throws \InvalidArgumentException
   */
  private function discoverNamespace(\DOMXPath $domxpath, $prefix)
  {
      if (isset($this->namespaces[$prefix])) {
          return $this->namespaces[$prefix];
      }

      $declaredName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

      // ask for one namespace, otherwise we'd get a collection with an item for each node
      $declarations = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $declaredName));

      $declaration = $declarations->item(0);
      if ($declaration) {
          return $declaration->nodeValue;
      }

      return null;
  }
  /**
   * Lists the distinct namespace prefixes used in an XPath expression.
   *
   * A prefix is a name followed by a single colon; the double colon of an
   * axis (as in "self::a") is not taken for a prefix.
   *
   * @param string $xpath
   *
   * @return array
   */
  private function findNamespacePrefixes($xpath)
  {
      $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

      return $found ? array_unique($matches['prefix']) : array();
  }
  /**
   * Creates a crawler for some subnodes.
   *
   * The new crawler is of the same class as this one and is given this
   * crawler's current URI and base href.
   *
   * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
   *
   * @return static
   */
  private function createSubCrawler($nodes)
  {
      return new static($nodes, $this->uri, $this->baseHref);
  }
  888. }