Selector.php 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. <?php
  2. namespace PHPHtmlParser\Selector;
  3. use PHPHtmlParser\Dom\AbstractNode;
  4. use PHPHtmlParser\Dom\Collection;
  5. use PHPHtmlParser\Dom\InnerNode;
  6. use PHPHtmlParser\Dom\LeafNode;
  7. use PHPHtmlParser\Exceptions\ChildNotFoundException;
  /**
   * Class Selector
   *
   * Resolves a parsed selector string against a DOM tree. The selector string
   * is handed to a ParserInterface, which turns it into a list of selectors;
   * each selector is an ordered list of rules. The rule keys read by this class
   * are 'tag', 'key', 'value', 'operator', 'noKey' and 'alterNext'.
   *
   * @package PHPHtmlParser
   */
  class Selector
  {
      /**
       * Parsed selectors, as returned by the parser in the constructor.
       * Each entry is a list of rule arrays.
       *
       * @var array
       */
      protected $selectors = [];

      /**
       * Constructs with the selector string and the parser used to decode it.
       *
       * @param string          $selector raw selector string
       * @param ParserInterface $parser   parser that converts the string into rules
       */
      public function __construct(string $selector, ParserInterface $parser)
      {
          $this->selectors = $parser->parseSelectorString($selector);
      }

      /**
       * Returns the selectors that were found in __construct.
       *
       * @return array
       */
      public function getSelectors()
      {
          return $this->selectors;
      }

      /**
       * Attempts to find the selectors starting from the given node object.
       *
       * Every selector is evaluated independently from $node; the matches of
       * all selectors are appended, in order, to a single collection.
       *
       * @param AbstractNode $node
       * @return Collection
       */
      public function find(AbstractNode $node): Collection
      {
          $results = new Collection;
          foreach ($this->selectors as $selector) {
              if (count($selector) === 0) {
                  continue;
              }

              $nodes   = [$node];
              $options = [];
              foreach ($selector as $rule) {
                  if ($rule['alterNext']) {
                      // this rule only modifies how the next rule is applied
                      $options[] = $this->alterNext($rule);
                      continue;
                  }
                  $nodes = $this->seek($nodes, $rule, $options);
                  // options only ever apply to the rule directly after them
                  $options = [];
              }

              // this is the final set of nodes for the selector
              foreach ($nodes as $result) {
                  $results[] = $result;
              }
          }

          return $results;
      }

      /**
       * Attempts to find all children that match the rule given.
       *
       * A rule with a numeric 'key' is treated as an XPath-style index and
       * picks the n-th entry of $nodes itself whose tag matches. Any other rule
       * is tested against the children of each node; children that fail but
       * have children of their own are searched recursively, unless the
       * 'checkGrandChildren' option is set to a falsy value.
       *
       * @param array $nodes
       * @param array $rule
       * @param array $options list of option arrays (or an already flat option array)
       * @return array
       * @recursive
       */
      protected function seek(array $nodes, array $rule, array $options): array
      {
          // XPath index
          if (array_key_exists('tag', $rule) &&
              array_key_exists('key', $rule) &&
              is_numeric($rule['key'])
          ) {
              return $this->seekByIndex($nodes, $rule);
          }

          $options = $this->flattenOptions($options);

          // none of these depend on the node being inspected, so compute once
          $tag          = $rule['tag'] ?? null;
          $key          = $rule['key'] ?? null;
          $value        = $rule['value'] ?? null;
          $grabAll      = $tag == '*' && is_null($key);
          $compareValue = ! is_null($key) && ! is_null($value) && $value != '*';
          $descend      = ! isset($options['checkGrandChildren']) ||
              $options['checkGrandChildren'];

          $return = [];
          /** @var InnerNode $node */
          foreach ($nodes as $node) {
              // check if we are a leaf
              if ($node instanceof LeafNode || ! $node->hasChildren()) {
                  continue;
              }

              $children = [];
              $child    = $node->firstChild();
              while ( ! is_null($child)) {
                  if ($grabAll) {
                      // wild card, grab all
                      $return[] = $child;
                  } else {
                      $pass = $this->checkTag($rule, $child);
                      if ($pass && ! is_null($key)) {
                          $pass = $this->checkKey($rule, $child);
                      }
                      if ($pass && $compareValue) {
                          $pass = $this->checkComparison($rule, $child);
                      }

                      if ($pass) {
                          // it passed all checks
                          $return[] = $child;
                      } elseif ($child instanceof InnerNode && $child->hasChildren()) {
                          // this child failed to match, but we still want
                          // to check its children
                          $children[] = $child;
                      }
                  }
                  $child = $this->getNextChild($node, $child);
              }

              if ($descend && count($children) > 0) {
                  // we have children that failed but are not leaves.
                  foreach ($this->seek($children, $rule, $options) as $match) {
                      $return[] = $match;
                  }
              }
          }

          return $return;
      }

      /**
       * Handles the XPath-style index rule: returns the n-th node of $nodes
       * (1-based, n being $rule['key']) whose tag matches $rule['tag'].
       *
       * @param array $nodes
       * @param array $rule
       * @return array a single-element array, or an empty array if not found
       */
      private function seekByIndex(array $nodes, array $rule): array
      {
          $count = 0;
          /** @var AbstractNode $node */
          foreach ($nodes as $node) {
              if ($rule['tag'] == '*' ||
                  $rule['tag'] == $node->getTag()->name()
              ) {
                  ++$count;
                  if ($count == $rule['key']) {
                      // found the node we wanted
                      return [$node];
                  }
              }
          }

          return [];
      }

      /**
       * Attempts to match the given arguments with the given operator.
       *
       * Both pattern and value are lower-cased first, so every operator is
       * case-insensitive. For '*=' the pattern is used as a regular expression:
       * a pattern starting with '/' is taken as a complete, delimited regex,
       * anything else is wrapped in '/.../i'. Unknown operators never match.
       *
       * @param string $operator one of '=', '!=', '^=', '$=', '*='
       * @param string $pattern
       * @param string $value
       * @return bool
       */
      protected function match(string $operator, string $pattern, string $value): bool
      {
          $value   = strtolower($value);
          $pattern = strtolower($pattern);

          switch ($operator) {
              case '=':
                  return $value === $pattern;
              case '!=':
                  return $value !== $pattern;
              case '^=':
                  return preg_match('/^'.preg_quote($pattern, '/').'/', $value) === 1;
              case '$=':
                  return preg_match('/'.preg_quote($pattern, '/').'$/', $value) === 1;
              case '*=':
                  // guard the offset access: an empty pattern has no first character
                  if ($pattern !== '' && $pattern[0] === '/') {
                      return preg_match($pattern, $value) === 1;
                  }
                  // a bare '/' would terminate the regex early (e.g. "foo/bar"),
                  // so escape the delimiter; slashes that are already written
                  // as '\/' are normalised first so they are not escaped twice
                  $escaped = str_replace('/', '\/', str_replace('\/', '/', $pattern));

                  return preg_match('/'.$escaped.'/i', $value) === 1;
          }

          return false;
      }

      /**
       * Attempts to figure out what the alteration will be for
       * the next element.
       *
       * Only the '>' tag is recognised; it disables the search through
       * grandchildren for the following rule.
       *
       * @param array $rule
       * @return array
       */
      protected function alterNext(array $rule): array
      {
          $options = [];
          if ($rule['tag'] == '>') {
              $options['checkGrandChildren'] = false;
          }

          return $options;
      }

      /**
       * Flattens the option array.
       *
       * Merges a list of option arrays into one key => value array; later
       * entries override earlier ones. Entries that are not arrays are kept
       * under their own key, so an already flattened array (as passed by the
       * recursive call in seek()) is returned unchanged.
       *
       * @param array $optionsArray
       * @return array
       */
      protected function flattenOptions(array $optionsArray)
      {
          $options = [];
          foreach ($optionsArray as $name => $optionArray) {
              if ( ! is_array($optionArray)) {
                  // already a flat option
                  $options[$name] = $optionArray;
                  continue;
              }
              foreach ($optionArray as $key => $option) {
                  $options[$key] = $option;
              }
          }

          return $options;
      }

      /**
       * Returns the next child or null if no more children.
       *
       * @param AbstractNode $node         parent whose children are being walked
       * @param AbstractNode $currentChild child to continue after
       * @return AbstractNode|null
       */
      protected function getNextChild(AbstractNode $node, AbstractNode $currentChild)
      {
          try {
              // get next child
              return $node->nextChild($currentChild->id());
          } catch (ChildNotFoundException $e) {
              // no more children
              return null;
          }
      }

      /**
       * Checks tag condition from rules against node.
       *
       * An empty tag or '*' matches every node.
       *
       * @param array        $rule
       * @param AbstractNode $node
       * @return bool
       */
      protected function checkTag(array $rule, AbstractNode $node): bool
      {
          if (empty($rule['tag']) || $rule['tag'] == '*') {
              return true;
          }

          return $rule['tag'] == $node->getTag()->name();
      }

      /**
       * Checks key condition from rules against node.
       *
       * With 'noKey' set the node must NOT provide a value for the attribute;
       * otherwise the attribute must exist, except for the pseudo key
       * 'plaintext', which is always accepted here.
       *
       * @param array        $rule
       * @param AbstractNode $node
       * @return bool
       */
      protected function checkKey(array $rule, AbstractNode $node): bool
      {
          if ($rule['noKey']) {
              return is_null($node->getAttribute($rule['key']));
          }

          return $rule['key'] == 'plaintext' || $node->hasAttribute($rule['key']);
      }

      /**
       * Checks comparison condition from rules against node.
       *
       * Compares the node text (key 'plaintext') or the attribute named by
       * the key against the rule value. For the 'class' key each
       * whitespace-separated class name is additionally tested on its own.
       *
       * @param array        $rule
       * @param AbstractNode $node
       * @return bool
       */
      public function checkComparison(array $rule, AbstractNode $node): bool
      {
          if ($rule['key'] == 'plaintext') {
              // plaintext search
              $nodeValue = $node->text();
          } else {
              // normal search
              $nodeValue = $node->getAttribute($rule['key']);
          }

          // getAttribute() may yield null (checkKey() tests for exactly that);
          // match() only accepts strings, so normalise before calling it
          $operator = (string) $rule['operator'];
          $pattern  = (string) $rule['value'];

          if ($this->match($operator, $pattern, (string) $nodeValue)) {
              return true;
          }
          if ($rule['key'] != 'class') {
              return false;
          }

          // handle multiple classes, separated by any run of whitespace
          $classes = preg_split(
              '/\s+/',
              (string) $node->getAttribute('class'),
              -1,
              PREG_SPLIT_NO_EMPTY
          ) ?: [];
          foreach ($classes as $class) {
              if ($this->match($operator, $pattern, $class)) {
                  return true;
              }
          }

          return false;
      }
  }