123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- <?php
- namespace PicoFeed\Filter;
- use PicoFeed\Config\Config;
- use PicoFeed\Client\Url;
- use PicoFeed\Scraper\RuleLoader;
- use PicoFeed\Parser\XmlParser;
- /**
- * HTML Filter class.
- *
- * @author Frederic Guillot
- */
/**
 * HTML Filter class.
 *
 * Converts the given HTML to XML, walks it with the expat-based XML parser and
 * rebuilds an output string from the tags and attributes accepted by the
 * Tag and Attribute helpers. Site-specific regex rules are applied afterwards.
 *
 * @author Frederic Guillot
 */
class Html
{
    /**
     * Config object.
     *
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Unfiltered XML data.
     *
     * @var string
     */
    private $input = '';

    /**
     * Filtered XML data.
     *
     * @var string
     */
    private $output = '';

    /**
     * Stack of "was this opening tag dropped?" flags, one entry per open tag.
     *
     * Pushed in startTag() and popped in endTag() so that a closing tag is
     * only emitted when its opening tag was emitted.
     *
     * @var array
     */
    private $empty_tags = array();

    /**
     * Empty flag of the tag currently being opened.
     *
     * @var bool
     */
    private $empty = true;

    /**
     * Tag instance (assigned in the constructor).
     *
     * @var \PicoFeed\Filter\Tag
     */
    public $tag = '';

    /**
     * Attribute instance (assigned in the constructor).
     *
     * @var \PicoFeed\Filter\Attribute
     */
    public $attribute = '';

    /**
     * The website to filter.
     *
     * @var string
     */
    private $website;

    /**
     * Initialize the filter, all inputs data must be encoded in UTF-8 before.
     *
     * @param string $html    HTML content
     * @param string $website Site URL (used to build absolute URL)
     */
    public function __construct($html, $website)
    {
        $this->config = new Config();
        $this->input = XmlParser::htmlToXml($html);
        $this->output = '';
        $this->tag = new Tag($this->config);
        $this->website = $website;
        $this->attribute = new Attribute(new Url($website));
    }

    /**
     * Set config object and push its filter settings to the helpers.
     *
     * When $config is null the property is still overwritten, but the
     * Attribute and Tag helpers are left untouched.
     *
     * @param \PicoFeed\Config\Config $config Config instance
     * @return \PicoFeed\Filter\Html
     */
    public function setConfig($config)
    {
        $this->config = $config;

        if ($this->config !== null) {
            $this->attribute->setImageProxyCallback($this->config->getFilterImageProxyCallback());
            $this->attribute->setImageProxyUrl($this->config->getFilterImageProxyUrl());
            $this->attribute->setImageProxyProtocol($this->config->getFilterImageProxyProtocol());
            $this->attribute->setIframeWhitelist($this->config->getFilterIframeWhitelist(array()));
            $this->attribute->setIntegerAttributes($this->config->getFilterIntegerAttributes(array()));
            $this->attribute->setAttributeOverrides($this->config->getFilterAttributeOverrides(array()));
            $this->attribute->setRequiredAttributes($this->config->getFilterRequiredAttributes(array()));
            $this->attribute->setMediaBlacklist($this->config->getFilterMediaBlacklist(array()));
            $this->attribute->setMediaAttributes($this->config->getFilterMediaAttributes(array()));
            $this->attribute->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array()));

            // The same setting feeds both helpers: the full value goes to the
            // attribute filter, its keys go to the tag filter.
            $whitelisted_tags = $this->config->getFilterWhitelistedTags(array());
            $this->attribute->setWhitelistedAttributes($whitelisted_tags);
            $this->tag->setWhitelistedTags(array_keys($whitelisted_tags));
        }

        return $this;
    }

    /**
     * Run tags/attributes filtering.
     *
     * Parse state is reset first so that calling this method more than once
     * on the same instance does not append to the previous result.
     *
     * @return string Filtered content
     */
    public function execute()
    {
        $this->output = '';
        $this->empty_tags = array();
        $this->empty = true;

        $this->preFilter();

        $parser = xml_parser_create();

        // Callable arrays are used instead of xml_set_object() + method names,
        // which newer PHP versions deprecate.
        xml_set_element_handler($parser, array($this, 'startTag'), array($this, 'endTag'));
        xml_set_character_data_handler($parser, array($this, 'dataTag'));
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);

        // Best effort: a parse error simply leaves the output built so far.
        xml_parse($parser, $this->input, true);

        // Parsers are objects since PHP 8.0 and are released automatically;
        // only the older resource-based parser needs an explicit free.
        if (PHP_VERSION_ID < 80000) {
            xml_parser_free($parser);
        }

        unset($parser);

        $this->postFilter();

        return $this->output;
    }

    /**
     * Called before XML parsing: passes the input through
     * Tag::removeBlacklistedTags().
     */
    public function preFilter()
    {
        $this->input = $this->tag->removeBlacklistedTags($this->input);
    }

    /**
     * Called after XML parsing: removes empty tags, applies the site rules,
     * collapses multiple break tags and trims the result.
     */
    public function postFilter()
    {
        $this->output = $this->tag->removeEmptyTags($this->output);
        $this->output = $this->filterRules($this->output);
        $this->output = $this->tag->removeMultipleBreakTags($this->output);
        $this->output = trim($this->output);
    }

    /**
     * Apply the site-specific "filter" rules to the content.
     *
     * Each entry of the "filter" rule set maps a URL pattern to a list of
     * search => replace regular expressions; the replacements are applied
     * when the pattern matches the full path of the website URL.
     *
     * A replacement that fails (preg_replace() returning null) is skipped so
     * that a faulty rule cannot wipe the content.
     *
     * @param string $content
     * @return string
     */
    public function filterRules($content)
    {
        // the constructor should require a config, then this if can be removed
        $config = $this->config === null ? new Config() : $this->config;

        $loader = new RuleLoader($config);
        $rules = $loader->getRules($this->website);

        if (!isset($rules['filter']) || !is_array($rules['filter'])) {
            return $content;
        }

        $url = new Url($this->website);
        $sub_url = $url->getFullPath();

        foreach ($rules['filter'] as $pattern => $rule) {
            if (!is_array($rule) || !preg_match($pattern, $sub_url)) {
                continue;
            }

            foreach ($rule as $search => $replace) {
                $replaced = preg_replace($search, $replace, $content);

                // null signals a PCRE error; keep the content as it was.
                if ($replaced !== null) {
                    $content = $replaced;
                }
            }
        }

        return $content;
    }

    /**
     * Parse opening tag.
     *
     * The tag is written to the output only when the Tag helper allows it and
     * the filtered attributes satisfy the Attribute helper's requirements.
     * Whether it was dropped is recorded for the matching endTag() call.
     *
     * @param resource $parser     XML parser
     * @param string   $tag        Tag name
     * @param array    $attributes Tag attributes
     */
    public function startTag($parser, $tag, array $attributes)
    {
        $this->empty = true;

        if ($this->tag->isAllowed($tag, $attributes)) {
            $attributes = $this->attribute->filter($tag, $attributes);

            if ($this->attribute->hasRequiredAttributes($tag, $attributes)) {
                $attributes = $this->attribute->addAttributes($tag, $attributes);
                $this->output .= $this->tag->openHtmlTag($tag, $this->attribute->toHtml($attributes));
                $this->empty = false;
            }
        }

        $this->empty_tags[] = $this->empty;
    }

    /**
     * Parse closing tag.
     *
     * The closing tag is emitted only when the matching opening tag was
     * emitted and the tag name is allowed.
     *
     * @param resource $parser XML parser
     * @param string   $tag    Tag name
     */
    public function endTag($parser, $tag)
    {
        if (!array_pop($this->empty_tags) && $this->tag->isAllowedTag($tag)) {
            $this->output .= $this->tag->closeHtmlTag($tag);
        }
    }

    /**
     * Parse tag content.
     *
     * @param resource $parser  XML parser
     * @param string   $content Tag content
     */
    public function dataTag($parser, $content)
    {
        // Replace the UTF-8 non-breaking space with a normal space
        $content = str_replace("\xc2\xa0", ' ', $content);
        $this->output .= Filter::escape($content);
    }
}
|