XmlParser.php 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. <?php
  2. namespace PicoFeed\Parser;
  3. use DOMDocument;
  4. use SimpleXMLElement;
  5. use Laminas\Xml\Exception\RuntimeException;
  6. use Laminas\Xml\Security;
  /**
   * XML parser class.
   *
   * Checks for XML eXternal Entity (XXE) and XML Entity Expansion (XEE) attacks on XML documents
   * by delegating the scan to Laminas\Xml\Security, and provides small helpers for HTML loading,
   * encoding detection and XPath queries.
   *
   * @package PicoFeed\Parser
   * @author  Frederic Guillot
   */
  class XmlParser
  {
      /**
       * Formatted libxml error messages collected by the last getHtmlDocument() parse.
       *
       * @var string[]
       */
      protected static $errors = [];

      /**
       * Get a SimpleXMLElement instance or return false.
       *
       * @static
       * @param  string $input XML content
       * @throws XmlEntityException When the security scan rejects the document
       * @return mixed
       */
      public static function getSimpleXml($input)
      {
          return self::scan($input);
      }

      /**
       * Get a DOMDocument instance or return false.
       *
       * False is returned for empty input and for documents that end up without any child node.
       *
       * @static
       * @param  string $input XML content
       * @throws XmlEntityException When the security scan rejects the document
       * @return DOMDocument|bool
       */
      public static function getDomDocument($input)
      {
          if (empty($input)) {
              return false;
          }

          $dom = self::scan($input, new DOMDocument());

          // The document is empty, there are probably some parsing errors
          if ($dom && $dom->childNodes->length === 0) {
              return false;
          }

          return $dom;
      }

      /**
       * Small wrapper around Laminas Xml to turn their exceptions into PicoFeed exceptions.
       *
       * @static
       * @access private
       * @param  string      $input XML content
       * @param  DOMDocument $dom   Optional document handed over to the scanner
       * @throws XmlEntityException When the scanner throws a RuntimeException
       * @return SimpleXMLElement|DOMDocument|boolean
       */
      private static function scan($input, $dom = null)
      {
          try {
              return Security::scan($input, $dom);
          } catch (RuntimeException $e) {
              throw new XmlEntityException($e->getMessage());
          }
      }

      /**
       * Load a HTML document by using a DOMDocument instance.
       *
       * An empty DOMDocument is returned for empty input. The libxml errors raised while
       * parsing are stored as formatted strings and can be read with getErrors().
       *
       * @static
       * @access public
       * @param  string $input HTML content
       * @return DOMDocument
       */
      public static function getHtmlDocument($input)
      {
          $dom = new DOMDocument();

          if (empty($input)) {
              return $dom;
          }

          // Remember the caller's libxml error mode so that it can be restored afterwards
          // instead of being unconditionally switched off.
          $previousMode = libxml_use_internal_errors(true);

          if (version_compare(PHP_VERSION, '5.4.0', '>=')) {
              // LIBXML_NONET forbids network access while loading the document
              $dom->loadHTML($input, LIBXML_NONET);
          } else {
              $dom->loadHTML($input);
          }

          self::$errors = [];

          foreach (libxml_get_errors() as $error) {
              self::$errors[] = sprintf(
                  'XML error: %s (Line: %d - Column: %d - Code: %d)',
                  $error->message,
                  $error->line,
                  $error->column,
                  $error->code
              );
          }

          // Empty the libxml buffer, then hand the error mode back as we found it
          libxml_clear_errors();
          libxml_use_internal_errors($previousMode);

          return $dom;
      }

      /**
       * Convert a HTML document to XML.
       *
       * The HTML is prefixed with an XML declaration announcing UTF-8 before being loaded,
       * then the first <body> element is serialized.
       *
       * @static
       * @access public
       * @param  string $html HTML document
       * @return string
       */
      public static function htmlToXml($html)
      {
          $dom = self::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
          $body = $dom->getElementsByTagName('body')->item(0);

          return $dom->saveXML($body);
      }

      /**
       * Get XML parser errors.
       *
       * @static
       * @access public
       * @return string Error messages of the last HTML parse, separated by ", "
       */
      public static function getErrors()
      {
          return implode(', ', self::$errors);
      }

      /**
       * Get the encoding from a xml tag.
       *
       * Only the text located before the last "?>" is inspected. An empty string is returned
       * when there is no XML declaration, no "?>", no encoding attribute or no closing quote.
       *
       * @static
       * @access public
       * @param  string $data Input data
       * @return string Lowercase encoding name or an empty string
       */
      public static function getEncodingFromXmlTag($data)
      {
          if (strpos($data, '<?xml') === false) {
              return '';
          }

          $end = strrpos($data, '?>');

          if ($end === false) {
              return '';
          }

          // Normalize single quotes so that only one kind of delimiter has to be searched
          $header = str_replace("'", '"', substr($data, 0, $end));
          $attribute = strpos($header, 'encoding=');

          if ($attribute === false) {
              return '';
          }

          // Skip the 9 characters of 'encoding=' plus the opening quote
          $valueStart = $attribute + 10;

          // An offset beyond the end of the string makes strpos() fail (ValueError on PHP 8)
          if ($valueStart > strlen($header)) {
              return '';
          }

          $valueEnd = strpos($header, '"', $valueStart);

          if ($valueEnd === false) {
              return '';
          }

          return strtolower(substr($header, $valueStart, $valueEnd - $valueStart));
      }

      /**
       * Get the charset from a meta tag.
       *
       * @static
       * @access public
       * @param  string $data Input data
       * @return string Lowercase charset name or an empty string when no meta charset is found
       */
      public static function getEncodingFromMetaTag($data)
      {
          if (preg_match('/<meta.*?charset\s*=\s*["\']?\s*([^"\'\s\/>;]+)/i', $data, $match) === 1) {
              return strtolower($match[1]);
          }

          return '';
      }

      /**
       * Rewrite XPath query to use namespace-uri and local-name derived from prefix.
       *
       * The special prefix "xml" is left untouched. A prefix missing from the mapping is
       * rewritten with an empty namespace URI.
       *
       * @static
       * @access public
       * @param  string $query XPath query
       * @param  array  $ns    Prefix to namespace URI mapping
       * @return string
       */
      public static function replaceXPathPrefixWithNamespaceURI($query, array $ns)
      {
          return preg_replace_callback(
              '/([A-Z0-9]+):([A-Z0-9]+)/iu',
              function ($matches) use ($ns) {
                  $prefix = $matches[1];

                  // don't try to map the special prefix XML
                  if (strtolower($prefix) === 'xml') {
                      return $matches[0];
                  }

                  // Guard the lookup: an unmapped prefix must not raise an "undefined index" warning
                  $uri = isset($ns[$prefix]) ? $ns[$prefix] : '';

                  return '*[namespace-uri()="'.$uri.'" and local-name()="'.$matches[2].'"]';
              },
              $query
          );
      }

      /**
       * Get the result elements of a XPath query.
       *
       * When a prefix mapping is given, the query is first rewritten with
       * replaceXPathPrefixWithNamespaceURI().
       *
       * @static
       * @access public
       * @param  SimpleXMLElement $xml   XML element
       * @param  string           $query XPath query
       * @param  array            $ns    Prefix to namespace URI mapping
       * @return SimpleXMLElement[]
       */
      public static function getXPathResult(SimpleXMLElement $xml, $query, array $ns = array())
      {
          if (!empty($ns)) {
              $query = static::replaceXPathPrefixWithNamespaceURI($query, $ns);
          }

          return $xml->xpath($query);
      }

      /**
       * Get the first Xpath result or SimpleXMLElement value.
       *
       * The first item of a non-empty array is cast to string and trimmed. A SimpleXMLElement
       * is cast to string and returned as is (not trimmed). Anything else gives an empty string.
       *
       * @static
       * @access public
       * @param  mixed $value
       * @return string
       */
      public static function getValue($value)
      {
          if (is_array($value) && count($value) > 0) {
              return trim((string) $value[0]);
          }

          if ($value instanceof SimpleXMLElement) {
              return (string) $value;
          }

          return '';
      }
  }