Filter.php 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. <?php
  2. namespace PicoFeed\Filter;
  /**
   * String clean-up helpers applied to HTML and XML content.
   *
   * Every helper takes a string and returns the cleaned-up string; none of
   * them keeps state on the class.
   *
   * @author Frederic Guillot
   */
  class Filter
  {
      /**
       * Get the Html filter instance.
       *
       * @static
       * @param  string  $html      HTML content
       * @param  string  $website   Site URL (used to build absolute URL)
       * @return Html
       */
      public static function html($html, $website)
      {
          return new Html($html, $website);
      }

      /**
       * Escape HTML content.
       *
       * Both single and double quotes are converted (ENT_QUOTES) and entities
       * that are already present in the input are not encoded a second time.
       *
       * @static
       * @param  string  $content
       * @return string
       */
      public static function escape($content)
      {
          return htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
      }

      /**
       * Remove the document-level tags: the doctype and the opening/closing
       * html, head and body tags.
       *
       * Only the tags themselves (and the whitespace that follows them) are
       * removed, their content is kept.
       *
       * @param  string  $data  Input data
       * @return string
       */
      public function removeHTMLTags($data)
      {
          // The word boundary keeps look-alike elements such as <header> intact.
          return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))\b[^>]*>\s*~i', '', $data);
      }

      /**
       * Remove the XML tag from a document.
       *
       * Everything up to and including the end of the XML declaration is
       * dropped, then the same is done for "xml-stylesheet" processing
       * instructions. An instruction that is never closed by "?>" is left
       * untouched instead of truncating the document.
       *
       * @static
       * @param  string  $data  Input data
       * @return string
       */
      public static function stripXmlTag($data)
      {
          $start = strpos($data, '<?xml');

          if ($start !== false) {
              // Look for the terminator after the declaration, not from the start of the data
              $end = strpos($data, '?>', $start);

              if ($end !== false) {
                  $data = ltrim(substr($data, $end + 2));
              }
          }

          do {
              $pos = strpos($data, '<?xml-stylesheet ');
              $end = $pos === false ? false : strpos($data, '?>', $pos);

              if ($end !== false) {
                  $data = ltrim(substr($data, $end + 2));
              }

              // Keep going only while the instruction just removed started near
              // the beginning of the data (first 200 bytes)
          } while ($end !== false && $pos < 200);

          return $data;
      }

      /**
       * Strip head tag from the HTML content.
       *
       * The head element is removed together with its content.
       *
       * @static
       * @param  string  $data  Input data
       * @return string
       */
      public static function stripHeadTags($data)
      {
          // The word boundary prevents <header ...> from being taken for <head ...>
          return preg_replace('@<head\b[^>]*?>.*?</head>@siu', '', $data);
      }

      /**
       * Trim whitespace from the beginning and the end of a string and replace
       * carriage returns, tabs and line feeds inside it by a space.
       *
       * Each character is replaced one for one: runs of whitespace are not
       * collapsed.
       *
       * @static
       * @param  string  $value  Raw data
       * @return string         Normalized data
       */
      public static function stripWhiteSpace($value)
      {
          // Plain string replacement on purpose: a '/\s+/' regex broke utf-8 strings
          $value = str_replace(array("\r", "\t", "\n"), ' ', $value);

          return trim($value);
      }

      /**
       * Fixes before XML parsing.
       *
       * Removes numeric character references that point to a code point not
       * allowed in XML 1.0, then removes the raw characters that are outside
       * of the XML 1.0 range.
       *
       * preg_replace() returns null on failure (for example when the data is
       * not valid utf-8), in which case the cast below yields an empty string.
       *
       * @static
       * @param  string  $data  Raw data
       * @return string         Normalized data
       */
      public static function normalizeData($data)
      {
          $entities = array(
              '/(&#)(\d+);/m',           // decimal encoded
              '/(&#x)([a-f0-9]+);/mi',   // hex encoded
          );

          // strip invalid XML 1.0 characters which are encoded as entities
          $data = preg_replace_callback($entities, function ($matches) {
              // The value is not cast to int: an oversized reference would
              // overflow and could wrap around to a valid looking code point
              if (strtolower($matches[1]) === '&#x') {
                  $code_point = hexdec($matches[2]);
              } else {
                  $code_point = (float) $matches[2];
              }

              // replace invalid characters
              if ($code_point < 9
                  || ($code_point > 10 && $code_point < 13)
                  || ($code_point > 13 && $code_point < 32)
                  || ($code_point > 55295 && $code_point < 57344)
                  || ($code_point > 65533 && $code_point < 65536)
                  || $code_point > 1114111
              ) {
                  return '';
              }

              return $matches[0];
          }, $data);

          // strip every utf-8 character that isn't in the range of valid XML 1.0 characters
          return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
      }
  }