Html.php 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. <?php
  2. namespace PicoFeed\Filter;
  3. use PicoFeed\Config\Config;
  4. use PicoFeed\Client\Url;
  5. use PicoFeed\Scraper\RuleLoader;
  6. use PicoFeed\Parser\XmlParser;
  7. /**
  8. * HTML Filter class.
  9. *
  10. * @author Frederic Guillot
  11. */
  class Html
  {
      /**
       * Config object.
       *
       * Passed to the Tag filter at construction time, used by setConfig()
       * to configure the attribute filter and by filterRules() to load the
       * site rules. May be null after setConfig(null).
       *
       * @var \PicoFeed\Config\Config|null
       */
      private $config;

      /**
       * Unfiltered XML data (the HTML input converted by XmlParser::htmlToXml()).
       *
       * @var string
       */
      private $input = '';

      /**
       * Filtered XML data, built incrementally by the parser callbacks.
       *
       * @var string
       */
      private $output = '';

      /**
       * Stack of "dropped" flags, one entry per element currently open.
       *
       * startTag() pushes the flag and endTag() pops it, so a closing tag is
       * only written when its opening tag was written.
       *
       * @var array
       */
      private $empty_tags = array();

      /**
       * True when the most recently opened tag was rejected (not written to
       * the output), false otherwise.
       *
       * @var bool
       */
      private $empty = true;

      /**
       * Tag instance.
       *
       * @var \PicoFeed\Filter\Tag
       */
      public $tag = '';

      /**
       * Attribute instance.
       *
       * @var \PicoFeed\Filter\Attribute
       */
      public $attribute = '';

      /**
       * The website to filter.
       *
       * @var string
       */
      private $website;

      /**
       * Initialize the filter, all inputs data must be encoded in UTF-8 before.
       *
       * @param string $html    HTML content
       * @param string $website Site URL (used to build absolute URL)
       */
      public function __construct($html, $website)
      {
          $this->config = new Config();
          $this->input = XmlParser::htmlToXml($html);
          $this->output = '';
          $this->tag = new Tag($this->config);
          $this->website = $website;
          $this->attribute = new Attribute(new Url($website));
      }

      /**
       * Set config object and propagate its filter settings to the attribute
       * and tag filters.
       *
       * When $config is null it is only stored; no setting is propagated and
       * filterRules() falls back to a default Config.
       *
       * @param \PicoFeed\Config\Config|null $config Config instance
       * @return \PicoFeed\Filter\Html
       */
      public function setConfig($config)
      {
          $this->config = $config;

          if ($this->config === null) {
              return $this;
          }

          $this->attribute->setImageProxyCallback($this->config->getFilterImageProxyCallback());
          $this->attribute->setImageProxyUrl($this->config->getFilterImageProxyUrl());
          $this->attribute->setImageProxyProtocol($this->config->getFilterImageProxyProtocol());
          $this->attribute->setIframeWhitelist($this->config->getFilterIframeWhitelist(array()));
          $this->attribute->setIntegerAttributes($this->config->getFilterIntegerAttributes(array()));
          $this->attribute->setAttributeOverrides($this->config->getFilterAttributeOverrides(array()));
          $this->attribute->setRequiredAttributes($this->config->getFilterRequiredAttributes(array()));
          $this->attribute->setMediaBlacklist($this->config->getFilterMediaBlacklist(array()));
          $this->attribute->setMediaAttributes($this->config->getFilterMediaAttributes(array()));
          $this->attribute->setSchemeWhitelist($this->config->getFilterSchemeWhitelist(array()));

          // The same "whitelisted tags" setting feeds both filters: the whole
          // value goes to the attribute filter, its keys go to the tag filter.
          $this->attribute->setWhitelistedAttributes($this->config->getFilterWhitelistedTags(array()));
          $this->tag->setWhitelistedTags(array_keys($this->config->getFilterWhitelistedTags(array())));

          return $this;
      }

      /**
       * Run tags/attributes filtering.
       *
       * Runs preFilter(), feeds the input to an XML parser whose callbacks
       * (startTag/endTag/dataTag) build the output, then runs postFilter().
       * The return value of xml_parse() is not checked, so a parse failure is
       * not reported to the caller.
       *
       * @return string Filtered content
       */
      public function execute()
      {
          // Start from a clean state so that a second call does not append
          // to the (already post-filtered) result of a previous run.
          $this->output = '';
          $this->empty_tags = array();

          $this->preFilter();

          $parser = xml_parser_create();

          // Array callables behave like xml_set_object() + method names but
          // avoid xml_set_object(), which is deprecated as of PHP 8.4.
          xml_set_element_handler($parser, array($this, 'startTag'), array($this, 'endTag'));
          xml_set_character_data_handler($parser, array($this, 'dataTag'));
          xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, false);
          xml_parse($parser, $this->input, true);

          // Only PHP < 8.0 represents parsers as resources that need freeing;
          // since 8.0 the parser is an object and xml_parser_free() is a no-op.
          if (is_resource($parser)) {
              xml_parser_free($parser);
          }

          $this->postFilter();

          return $this->output;
      }

      /**
       * Called before XML parsing: passes the input through
       * Tag::removeBlacklistedTags().
       */
      public function preFilter()
      {
          $this->input = $this->tag->removeBlacklistedTags($this->input);
      }

      /**
       * Called after XML parsing.
       *
       * Applies, in order: Tag::removeEmptyTags(), the site rules
       * (filterRules()), Tag::removeMultipleBreakTags() and trim().
       */
      public function postFilter()
      {
          $this->output = $this->tag->removeEmptyTags($this->output);
          $this->output = $this->filterRules($this->output);
          $this->output = $this->tag->removeMultipleBreakTags($this->output);
          $this->output = trim($this->output);
      }

      /**
       * Apply the search/replace rules registered for the current website.
       *
       * The 'filter' section of the rules maps a URL pattern to a list of
       * "search regex => replacement" pairs; pairs are applied when the
       * pattern matches the full path of the website URL.
       *
       * @param string $content Content to filter
       * @return string Filtered content
       */
      public function filterRules($content)
      {
          // the constructor should require a config, then this fallback can be removed
          $config = $this->config === null ? new Config() : $this->config;

          $loader = new RuleLoader($config);
          $rules = $loader->getRules($this->website);

          $url = new Url($this->website);
          $sub_url = $url->getFullPath();

          if (!isset($rules['filter'])) {
              return $content;
          }

          foreach ($rules['filter'] as $pattern => $rule) {
              if (!preg_match($pattern, $sub_url)) {
                  continue;
              }

              foreach ($rule as $search => $replace) {
                  $filtered = preg_replace($search, $replace, $content);

                  // preg_replace() returns null on a PCRE error; keep the
                  // previous content instead of wiping the whole document.
                  if ($filtered !== null) {
                      $content = $filtered;
                  }
              }
          }

          return $content;
      }

      /**
       * Parse opening tag.
       *
       * The tag is written to the output only when it is allowed and, after
       * attribute filtering, still has its required attributes. The outcome
       * is pushed on the stack so that endTag() can mirror the decision.
       *
       * @param resource $parser     XML parser
       * @param string   $tag        Tag name
       * @param array    $attributes Tag attributes
       */
      public function startTag($parser, $tag, array $attributes)
      {
          $this->empty = true;

          if ($this->tag->isAllowed($tag, $attributes)) {
              $attributes = $this->attribute->filter($tag, $attributes);

              if ($this->attribute->hasRequiredAttributes($tag, $attributes)) {
                  $attributes = $this->attribute->addAttributes($tag, $attributes);
                  $this->output .= $this->tag->openHtmlTag($tag, $this->attribute->toHtml($attributes));
                  $this->empty = false;
              }
          }

          $this->empty_tags[] = $this->empty;
      }

      /**
       * Parse closing tag.
       *
       * Writes the closing tag only when the matching opening tag was
       * written (flag popped from the stack) and the tag name is allowed.
       *
       * @param resource $parser XML parser
       * @param string   $tag    Tag name
       */
      public function endTag($parser, $tag)
      {
          if (!array_pop($this->empty_tags) && $this->tag->isAllowedTag($tag)) {
              $this->output .= $this->tag->closeHtmlTag($tag);
          }
      }

      /**
       * Parse tag content.
       *
       * @param resource $parser  XML parser
       * @param string   $content Tag content
       */
      public function dataTag($parser, $content)
      {
          // Replace the UTF-8 non-breaking space (U+00A0, i.e. &nbsp;) with a normal space
          $content = str_replace("\xc2\xa0", ' ', $content);
          $this->output .= Filter::escape($content);
      }
  }