rules = $rules; $this->dom = XmlParser::getHtmlDocument(''.$html); $this->xpath = new DOMXPath($this->dom); }

    /**
     * Get the relevant content with predefined rules.
     *
     * Runs stripTags() first, then returns the result of findContent().
     *
     * @return string
     */
    public function execute()
    {
        $this->stripTags();

        return $this->findContent();
    }

    /**
     * Remove from the document every node matched by the "strip" XPath rules.
     *
     * Does nothing when the "strip" rules are missing or are not an array.
     */
    public function stripTags()
    {
        if (! isset($this->rules['strip']) || ! is_array($this->rules['strip'])) {
            return;
        }

        foreach ($this->rules['strip'] as $pattern) {
            $nodes = $this->xpath->query($pattern);

            // query() yields false when the expression cannot be evaluated.
            if ($nodes === false) {
                continue;
            }

            foreach ($nodes as $node) {
                // A matched node without a parent (e.g. the document node itself)
                // cannot be detached; skip it instead of calling a method on null.
                if ($node->parentNode !== null) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Fetch content based on Xpath rules.
     *
     * Concatenates the serialized XML of every node matched by each "body"
     * rule, in rule order.
     *
     * @return string Empty string when no rule is defined or nothing matches
     */
    public function findContent()
    {
        $content = '';

        if (! isset($this->rules['body']) || ! is_array($this->rules['body'])) {
            return $content;
        }

        foreach ($this->rules['body'] as $pattern) {
            $nodes = $this->xpath->query($pattern);

            // query() yields false when the expression cannot be evaluated.
            if ($nodes === false) {
                continue;
            }

            foreach ($nodes as $node) {
                $content .= $this->dom->saveXML($node);
            }
        }

        return $content;
    }

    /**
     * Fetch next link based on Xpath rules.
     *
     * The "next_page" rules are tried in order. The first matched element
     * supplies its "href" attribute; a matched attribute node supplies its
     * own value. Other node types are skipped.
     *
     * @return string|null Link target, or null when no rule produces one
     */
    public function findNextLink()
    {
        if (! isset($this->rules['next_page']) || ! is_array($this->rules['next_page'])) {
            return null;
        }

        foreach ($this->rules['next_page'] as $pattern) {
            $nodes = $this->xpath->query($pattern);

            // query() yields false when the expression cannot be evaluated.
            if ($nodes === false) {
                continue;
            }

            foreach ($nodes as $node) {
                if ($node instanceof \DOMElement) {
                    return $node->getAttribute('href');
                }

                // Rule selected the attribute directly (e.g. "//a/@href").
                if ($node instanceof \DOMAttr) {
                    return $node->value;
                }

                // Only elements have getAttribute(); ignore text, comments, etc.
            }
        }

        return null;
    }
}