Dom.php 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782
  1. <?php
  2. namespace PHPHtmlParser;
  3. use PHPHtmlParser\Dom\AbstractNode;
  4. use PHPHtmlParser\Dom\HtmlNode;
  5. use PHPHtmlParser\Dom\TextNode;
  6. use PHPHtmlParser\Exceptions\NotLoadedException;
  7. use PHPHtmlParser\Exceptions\StrictException;
  8. use stringEncode\Encode;
  9. /**
  10. * Class Dom
  11. *
  12. * @package PHPHtmlParser
  13. */
  14. class Dom
  15. {
  /**
   * Charset used as the default source and target when the encoding is
   * set up in detectCharset().
   *
   * @var string
   */
  protected $defaultCharset = 'UTF-8';

  /**
   * Root node of the parsed tree; created by parse().
   *
   * @var HtmlNode
   */
  public $root;

  /**
   * The document string exactly as it was handed to loadStr().
   *
   * @var string
   */
  protected $raw;

  /**
   * Content wrapper the parser reads from. Stays null until a document
   * has been loaded, which is what isLoaded() checks.
   *
   * @var Content
   */
  protected $content = null;

  /**
   * Byte length of the document before cleaning.
   *
   * @var int
   */
  protected $rawSize;

  /**
   * Size of the document after cleaning.
   *
   * @var int
   */
  protected $size;

  /**
   * Options applied to every load call, before the per-call options.
   *
   * @var array
   */
  protected $globalOptions = [];

  /**
   * Options object built for the current document in loadStr().
   *
   * @var Options
   */
  protected $options;

  /**
   * Tag names that are always treated as self closing.
   *
   * @var array
   */
  protected $selfClosing = [
      'area',
      'base',
      'basefont',
      'br',
      'col',
      'embed',
      'hr',
      'img',
      'input',
      'keygen',
      'link',
      'meta',
      'param',
      'source',
      'spacer',
      'track',
      'wbr',
  ];

  /**
   * Self closing tag names that should be written without a trailing
   * slash (html5 style).
   *
   * @var array
   */
  protected $noSlash = [];
  /**
   * String form of the document: the inner html of the root node.
   *
   * @return string
   */
  public function __toString(): string
  {
      $markup = $this->root->innerHtml();

      return $markup;
  }
  /**
   * Forwards reads of unknown properties to the root node.
   *
   * @param string $name Property to read from the root node.
   * @return mixed
   */
  public function __get($name)
  {
      $root = $this->root;

      return $root->$name;
  }
  /**
   * Loads a document, picking the loader from the shape of the input.
   *
   * After calling AbstractNode::resetCount(), the input is treated as:
   *  - a file path, when it contains no newline and is_file() accepts it;
   *  - a URL, when it starts with http:// or https:// (case-insensitive);
   *  - raw markup otherwise.
   *
   * @param string $str     File path, URL, or markup.
   * @param array  $options Per-call options passed on to the chosen loader.
   * @return Dom
   * @chainable
   */
  public function load(string $str, array $options = []): Dom
  {
      AbstractNode::resetCount();

      // A string with a line break is never tried as a path.
      $isSingleLine = strpos($str, "\n") === false;
      if ($isSingleLine && is_file($str)) {
          return $this->loadFromFile($str, $options);
      }

      $isUrl = preg_match("/^https?:\/\//i", $str);

      return $isUrl
          ? $this->loadFromUrl($str, $options)
          : $this->loadStr($str, $options);
  }
  /**
   * Reads the given file (or stream wrapper path) and parses its content.
   *
   * @param string $file    Path handed to file_get_contents().
   * @param array  $options Per-call options passed on to loadStr().
   * @return Dom
   * @throws \RuntimeException When the file cannot be read.
   * @chainable
   */
  public function loadFromFile(string $file, array $options = []): Dom
  {
      $content = file_get_contents($file);

      // file_get_contents() signals failure with false; without this check
      // the false would be coerced to '' and an unreadable file would be
      // parsed as an empty document.
      if ($content === false) {
          throw new \RuntimeException("Unable to read the contents of '$file'.");
      }

      return $this->loadStr($content, $options);
  }
  /**
   * Fetches the content of a URL through a curl interface implementation
   * and parses it.
   *
   * @param string             $url     Address passed to the curl object's get().
   * @param array              $options Per-call options passed on to loadStr().
   * @param CurlInterface|null $curl    Fetcher to use; a new Curl is created when omitted.
   * @return Dom
   * @chainable
   */
  public function loadFromUrl(string $url, array $options = [], ?CurlInterface $curl = null): Dom
  {
      // The nullable type is spelled out explicitly: relying on the
      // "= null" default to make the type nullable is deprecated.
      if ($curl === null) {
          $curl = new Curl;
      }

      return $this->loadStr($curl->get($url), $options);
  }
  /**
   * Parses the html of the given string. Used by load(), loadFromFile(),
   * and loadFromUrl().
   *
   * Builds a fresh Options object (global options first, then the per-call
   * ones), records the raw document and its size, cleans the input, parses
   * it into the node tree and finally sets up the charset handling.
   *
   * @param string $str    Markup to parse.
   * @param array  $option Per-call options, applied after the global ones.
   * @return Dom
   * @chainable
   */
  public function loadStr(string $str, array $option = []): Dom
  {
      $this->options = new Options;
      $this->options->setOptions($this->globalOptions)
                    ->setOptions($option);

      $this->raw     = $str;
      $this->rawSize = strlen($str);

      $html = $this->clean($str);

      // Measure the cleaned markup, not the raw input; otherwise size would
      // always be identical to rawSize.
      $this->size    = strlen($html);
      $this->content = new Content($html);

      $this->parse();
      $this->detectCharset();

      return $this;
  }
  /**
   * Replaces the global options array that loadStr() applies to every
   * document before the per-call options.
   *
   * @param array $options
   * @return Dom
   * @chainable
   */
  public function setOptions(array $options): Dom
  {
      $this->globalOptions = $options;

      return $this;
  }
  /**
   * Runs a css selector search starting at the root node.
   *
   * @param string   $selector Selector handed to the root node's find().
   * @param int|null $nth      Passed through to the root node's find(); null by default.
   * @return mixed
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function find(string $selector, ?int $nth = null)
  {
      // The nullable type is spelled out explicitly: relying on the
      // "= null" default to make the type nullable is deprecated.
      $this->isLoaded();

      return $this->root->find($selector, $nth);
  }
  /**
   * Looks up an element by id through the root node's findById().
   *
   * @param int $id
   * @return mixed
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function findById(int $id)
  {
      $this->isLoaded();

      $match = $this->root->findById($id);

      return $match;
  }
  /**
   * Appends one tag name, or every tag name in an array, to the list of
   * tags that are always self closing.
   *
   * @param string|array $tag
   * @return Dom
   * @chainable
   */
  public function addSelfClosingTag($tag): Dom
  {
      // Treat a single value as a one-element list.
      $tags = is_array($tag) ? $tag : [$tag];

      foreach ($tags as $name) {
          $this->selfClosing[] = $name;
      }

      return $this;
  }
  /**
   * Removes one tag name, or every tag name in an array, from the list of
   * tags that are always self closing.
   *
   * @param string|array $tag
   * @return Dom
   * @chainable
   */
  public function removeSelfClosingTag($tag): Dom
  {
      $unwanted = is_array($tag) ? $tag : [$tag];

      // array_diff keeps the keys of the remaining entries.
      $this->selfClosing = array_diff($this->selfClosing, $unwanted);

      return $this;
  }
  /**
   * Empties the list of always-self-closing tags.
   *
   * @return Dom
   * @chainable
   */
  public function clearSelfClosingTags(): Dom
  {
      $this->selfClosing = [];

      return $this;
  }
  /**
   * Appends one tag name, or every tag name in an array, to the list of
   * self closing tags that should not get a trailing slash.
   *
   * @param string|array $tag
   * @return Dom
   * @chainable
   */
  public function addNoSlashTag($tag): Dom
  {
      // Treat a single value as a one-element list.
      $tags = is_array($tag) ? $tag : [$tag];

      foreach ($tags as $name) {
          $this->noSlash[] = $name;
      }

      return $this;
  }
  /**
   * Removes one tag name, or every tag name in an array, from the list of
   * no-slash tags.
   *
   * @param string|array $tag
   * @return Dom
   * @chainable
   */
  public function removeNoSlashTag($tag): Dom
  {
      $unwanted = is_array($tag) ? $tag : [$tag];

      // array_diff keeps the keys of the remaining entries.
      $this->noSlash = array_diff($this->noSlash, $unwanted);

      return $this;
  }
  /**
   * Resets the list of no-slash tags to an empty list.
   *
   * @return Dom
   * @chainable
   */
  public function clearNoSlashTags(): Dom
  {
      $this->noSlash = [];

      return $this;
  }
  /**
   * Returns the first child of the root node.
   *
   * @return \PHPHtmlParser\Dom\AbstractNode
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function firstChild(): \PHPHtmlParser\Dom\AbstractNode
  {
      $this->isLoaded();

      $first = $this->root->firstChild();

      return $first;
  }
  /**
   * Returns the last child of the root node.
   *
   * @return \PHPHtmlParser\Dom\AbstractNode
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function lastChild(): \PHPHtmlParser\Dom\AbstractNode
  {
      $this->isLoaded();

      $last = $this->root->lastChild();

      return $last;
  }
  /**
   * Returns the root node's child count.
   *
   * @return int
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function countChildren(): int
  {
      $this->isLoaded();

      $total = $this->root->countChildren();

      return $total;
  }
  /**
   * Returns the root node's children as an array.
   *
   * @return array
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function getChildren(): array
  {
      $this->isLoaded();

      $children = $this->root->getChildren();

      return $children;
  }
  /**
   * Tells whether the root node has any child nodes.
   *
   * @return bool
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function hasChildren(): bool
  {
      $this->isLoaded();

      $hasAny = $this->root->hasChildren();

      return $hasAny;
  }
  /**
   * Returns the first match of the selector "#<id>".
   *
   * @param string $id Value of the id attribute, without the leading '#'.
   * @return \PHPHtmlParser\Dom\AbstractNode|null
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function getElementById($id)
  {
      $this->isLoaded();

      $selector = "#{$id}";

      return $this->find($selector, 0);
  }
  /**
   * Returns whatever find() yields for the bare tag name used as a
   * selector.
   *
   * @param string $name Tag name.
   * @return mixed
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function getElementsByTag(string $name)
  {
      $this->isLoaded();

      $matches = $this->find($name);

      return $matches;
  }
  /**
   * Returns whatever find() yields for the selector ".<class>".
   *
   * @param string $class Class name, without the leading '.'.
   * @return mixed
   * @throws NotLoadedException When no document has been loaded yet.
   */
  public function getElementsByClass(string $class)
  {
      $this->isLoaded();

      $selector = ".{$class}";

      return $this->find($selector);
  }
  /**
   * Guard used by the public accessors: fails when no content object has
   * been created yet, i.e. none of the load methods has been called.
   *
   * @throws NotLoadedException
   */
  protected function isLoaded(): void
  {
      if ($this->content !== null) {
          return;
      }

      throw new NotLoadedException('Content is not loaded!');
  }
  /**
   * Strips non-html material from the document before it is parsed.
   *
   * When the 'cleanupInput' option is off the string is returned untouched.
   * Otherwise, in this order: whitespace between a closing attribute quote
   * and '>' is dropped, line breaks are replaced (by a space, or by '&#10;'
   * when 'preserveLineBreaks' is set), the doctype, comments and CDATA
   * sections are removed, then script blocks, style blocks and server side
   * script blocks are removed when their option is enabled, and finally
   * "{word ...}" template blocks are removed.
   *
   * @param string $str Raw document.
   * @return string Cleaned document.
   */
  protected function clean(string $str): string
  {
      if ( ! $this->options->get('cleanupInput')) {
          // cleanup is disabled altogether
          return $str;
      }

      $html = $str;

      // whitespace sitting between a quote and the end of the tag
      $html = mb_eregi_replace("'\s+>", "'>", $html);
      $html = mb_eregi_replace('"\s+>', '">', $html);

      // line breaks
      $lineBreak = $this->options->get('preserveLineBreaks') ? '&#10;' : ' ';
      $html = str_replace(["\r\n", "\r", "\n"], $lineBreak, $html);

      // doctype, comments and cdata, in that order
      foreach (["<!doctype(.*?)>", "<!--(.*?)-->", "<!\[CDATA\[(.*?)\]\]>"] as $pattern) {
          $html = mb_eregi_replace($pattern, '', $html);
      }

      // <script> blocks: first the form with attributes, then the bare form
      if ($this->options->get('removeScripts')) {
          $html = mb_eregi_replace("<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>", '', $html);
          $html = mb_eregi_replace("<\s*script\s*>(.*?)<\s*/\s*script\s*>", '', $html);
      }

      // <style> blocks: first the form with attributes, then the bare form
      if ($this->options->get('removeStyles')) {
          $html = mb_eregi_replace("<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>", '', $html);
          $html = mb_eregi_replace("<\s*style\s*>(.*?)<\s*/\s*style\s*>", '', $html);
      }

      // server side script blocks (<? ... ?>)
      // NOTE(review): the option key 'serverSideScriptis' looks like a
      // misspelling of 'serverSideScripts' - confirm against the Options
      // class before renaming it.
      if ($this->options->get('serverSideScriptis')) {
          $html = mb_eregi_replace("(<\?)(.*?)(\?>)", '', $html);
      }

      // smarty style blocks
      return mb_eregi_replace("(\{\w)(.*?)(\})", '', $html);
  }
  /**
   * Builds the node tree from the loaded content.
   *
   * Starts with a fresh 'root' node and keeps a "current" node that new
   * children are attached to. Text found before the next '<' becomes a
   * TextNode (whitespace-only text only when the whitespaceTextNode option
   * is on). Opening tags are attached and, unless self closing, become the
   * current node. A closing tag moves the current node to the parent of the
   * nearest ancestor with that tag name; when no such ancestor exists the
   * closing tag is ignored. Parsing stops when parseTag() reports a false
   * status or the current node becomes null.
   */
  protected function parse(): void
  {
      $this->root = new HtmlNode('root');
      $current    = $this->root;

      while ($current !== null) {
          $text = $this->content->copyUntil('<');

          if ($text != '') {
              // text in front of the next tag; keep it if we care about it
              if ($this->options->whitespaceTextNode || trim($text) != '') {
                  $current->addChild(new TextNode($text, $this->options->removeDoubleSpace));
              }
              continue;
          }

          $info = $this->parseTag();
          if ( ! $info['status']) {
              // nothing more to parse
              break;
          }

          if ($info['closing']) {
              // walk up until we reach the node this tag closes
              $ancestor = $current;
              while ($ancestor !== null && $ancestor->getTag()->name() != $info['tag']) {
                  $ancestor = $ancestor->getParent();
              }

              // an unmatched closing tag leaves the current node alone
              if ($ancestor !== null) {
                  $current = $ancestor->getParent();
              }
              continue;
          }

          if ( ! isset($info['node'])) {
              continue;
          }

          /** @var AbstractNode $child */
          $child = $info['node'];
          $current->addChild($child);

          // only tags that are not self closing can receive children
          if ( ! $child->getTag()->isSelfClosing()) {
              $current = $child;
          }
      }
  }
  /**
   * Attempts to read one tag from the current position of the content.
   *
   * The returned array always has the keys 'status', 'closing' and 'node':
   *  - not positioned on '<': status false;
   *  - closing tag of a self closing tag name: status true, nothing else set;
   *  - any other closing tag: status true, closing true, plus a 'tag' key
   *    holding the lower-cased tag name;
   *  - opening tag: status true and 'node' holding the new HtmlNode with
   *    its attributes set.
   *
   * @return array
   * @throws StrictException In strict mode, for an attribute without a
   *                         value or a self closing tag name written
   *                         without the closing slash.
   */
  protected function parseTag(): array
  {
      $result = [
          'status'  => false,
          'closing' => false,
          'node'    => null,
      ];

      if ($this->content->char() != '<') {
          // not at the start of a tag
          return $result;
      }

      // closing tag?
      if ($this->content->fastForward(1)->char() == '/') {
          $closingName = $this->content->fastForward(1)
                                       ->copyByToken('slash', true);

          // skip whatever is left of the tag, including the '>'
          $this->content->copyUntil('>');
          $this->content->fastForward(1);

          $closingName      = strtolower($closingName);
          $result['status'] = true;

          // closing tags of self closing tag names are simply dropped
          if ( ! in_array($closingName, $this->selfClosing)) {
              $result['closing'] = true;
              $result['tag']     = $closingName;
          }

          return $result;
      }

      $tag  = strtolower($this->content->copyByToken('slash', true));
      $node = new HtmlNode($tag);

      // attributes
      while ( ! ($this->content->char() == '>' || $this->content->char() == '/')) {
          $skipped = $this->content->skipByToken('blank', true);
          if (empty($skipped)) {
              $this->content->fastForward(1);
              continue;
          }

          $name = $this->content->copyByToken('equal', true);
          if ($name == '/') {
              break;
          }
          if (empty($name)) {
              $this->content->skipByToken('blank');
              continue;
          }

          $this->content->skipByToken('blank');

          if ($this->content->char() != '=') {
              // attribute without a value
              if ($this->options->strict) {
                  // not allowed in strict html
                  $character = $this->content->getPosition();
                  throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
              }
              $node->getTag()->$name = [
                  'value'       => null,
                  'doubleQuote' => true,
              ];
              if ($this->content->char() != '>') {
                  $this->content->rewind(1);
              }
              continue;
          }

          // attribute with a value: step over '=' and any blanks
          $this->content->fastForward(1)
                        ->skipByToken('blank');

          // work out which quote character, if any, wraps the value
          $first = $this->content->char();
          if ($first == '"') {
              $delimiter = '"';
          } elseif ($first == "'") {
              $delimiter = "'";
          } else {
              $delimiter = null;
          }

          // only single quoted values are flagged as not double quoted
          $attr = ['doubleQuote' => $delimiter !== "'"];

          if ($delimiter === null) {
              // unquoted value
              $attr['value'] = $this->content->copyByToken('attr', true);
          } else {
              $this->content->fastForward(1);
              $value = $this->content->copyUntil($delimiter, true, true);
              // keep appending until copyUntilUnless() has nothing more to give
              do {
                  $chunk  = $this->content->copyUntilUnless($delimiter, '=>');
                  $value .= $chunk;
              } while ( ! empty($chunk));
              $attr['value'] = $value;
              $this->content->fastForward(1);
          }

          $node->getTag()->$name = $attr;
      }

      $this->content->skipByToken('blank');

      if ($this->content->char() == '/') {
          // explicitly self closing
          $node->getTag()->selfClosing();
          $this->content->fastForward(1);
      } elseif (in_array($tag, $this->selfClosing)) {
          // should have been self closing; strict mode rejects that
          if ($this->options->strict) {
              $character = $this->content->getPosition();
              throw new StrictException("Tag '$tag' is not self closing! (character #$character)");
          }

          // force it to be self closing
          $node->getTag()->selfClosing();

          // html5 style tags are written without the trailing slash
          if (in_array($tag, $this->noSlash)) {
              $node->getTag()->noTrailingSlash();
          }
      }

      $this->content->fastForward(1);

      $result['status'] = true;
      $result['node']   = $node;

      return $result;
  }
  /**
   * Works out the source charset of the document and hands the resulting
   * Encode object to the root node.
   *
   * The encoder starts as defaultCharset -> defaultCharset. When the
   * enforceEncoding option is set, the method returns false right away and
   * nothing is propagated to the tree. Otherwise the content attribute of
   * the first meta[http-equiv=Content-Type] element is searched for
   * "charset=..."; on a match that value becomes the source charset. In
   * either case the encoder is then propagated from the root node.
   *
   * @return bool True only when a charset was found in the meta tag.
   */
  protected function detectCharset(): bool
  {
      // start from the default charset in both directions
      $encode = new Encode;
      $encode->from($this->defaultCharset);
      $encode->to($this->defaultCharset);

      if ($this->options->enforceEncoding !== null) {
          // the caller enforces an encoding; no detection takes place
          $encode->from($this->options->enforceEncoding);
          $encode->to($this->options->enforceEncoding);

          return false;
      }

      $detected = false;

      $meta = $this->root->find('meta[http-equiv=Content-Type]', 0);
      if ( ! is_null($meta)) {
          $content = $meta->content;
          if ( ! empty($content) && preg_match('/charset=(.+)/', $content, $matches)) {
              $encode->from(trim($matches[1]));
              $detected = true;
          }
      }

      // found or not, the tree gets the encoder
      $this->root->propagateEncoding($encode);

      return $detected;
  }
  678. }