Truncator.php 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. <?php
  2. namespace Grav\Common\Helpers;
  3. use DOMDocument;
  4. /**
  5. * This file is part of urodoz/truncateHTML.
  6. *
  7. * (c) Albert Lacarta <urodoz@gmail.com>
  8. *
  9. * For the full copyright and license information, please view the LICENSE
  10. * file that was distributed with this source code.
  11. */
  /**
   * Truncates an HTML string to a number of words or characters while keeping
   * the markup well formed.
   *
   * The input is parsed into a DOM, the tree is walked depth first while a
   * word/character budget is consumed, and each element is re-serialised with
   * only the content that fitted into the budget.
   *
   * NOTE(review): text is carried between recursion levels with only `&` and
   * `<?` escaped (see xmlEscape()/xmlUnescape()). A text node whose decoded
   * content contains a bare `<` is therefore handed to appendXML() unescaped
   * and will presumably fail to parse - confirm before relying on such input.
   */
  class Truncator
  {
      /**
       * Defaults merged underneath the options passed to truncate().
       *
       * - ellipsis:        string appended at the point where content is cut
       * - break:           separator searched for when 'word_safe' is enabled
       * - length_in_chars: false counts words, true counts characters
       * - word_safe:       in character mode, cut at the next 'break' found at
       *                    or after the limit instead of exactly at the limit
       *
       * @var array
       */
      public static $default_options = array(
          'ellipsis' => '…',
          'break' => ' ',
          'length_in_chars' => false,
          'word_safe' => false,
      );

      /**
       * Tags that are allowed to have an ellipsis appended inside them.
       *
       * @var string[]
       */
      public static $ellipsable_tags = array(
          'p', 'ol', 'ul', 'li',
          'div', 'header', 'article', 'nav',
          'section', 'footer', 'aside',
          'dd', 'dt', 'dl',
      );

      /**
       * Tags that are kept in the output even though they have no content.
       *
       * @var string[]
       */
      public static $self_closing_tags = array(
          'br', 'hr', 'img',
      );

      /**
       * Truncate given HTML string to specified length.
       * If length_in_chars is false it's trimmed by number
       * of words, otherwise by number of characters.
       *
       * @param string $html UTF-8 encoded HTML fragment
       * @param integer $length maximum number of words (or characters)
       * @param string|array $opts option array, or a string used as the ellipsis
       * @return string the truncated HTML fragment
       * @throws \RuntimeException when the HTML cannot be parsed
       */
      public static function truncate($html, $length, $opts = array())
      {
          if (is_string($opts)) {
              $opts = array('ellipsis' => $opts);
          }
          $opts = array_merge(static::$default_options, $opts);

          // Wrap the html in case it consists of adjacent nodes like
          // <p>foo</p><p>bar</p>, and turn every non-ASCII character into a
          // numeric entity so the parser does not have to guess the encoding.
          // mb_encode_numericentity() replaces the 'HTML-ENTITIES' pseudo
          // encoding of mb_convert_encoding(), deprecated since PHP 8.2.
          $html = mb_encode_numericentity(
              '<div>' . $html . '</div>',
              array(0x80, 0x10FFFF, 0, 0x1FFFFF),
              'UTF-8'
          );

          $doc = null;
          $root_node = null;

          // Parse using HTML5Lib if it's available.
          if (class_exists('HTML5Lib\\Parser')) {
              try {
                  $doc = \HTML5Lib\Parser::parse($html);
                  $root_node = $doc->documentElement->lastChild->lastChild;
              } catch (\Exception $e) {
                  // Deliberate best effort: fall back to DOMDocument below.
                  $root_node = null;
              }
          }

          if ($root_node === null) {
              // HTML5Lib not available (or it failed), so use DOMDocument.
              $doc = new DOMDocument('4.01', 'utf-8');
              $doc->formatOutput = false;
              $doc->preserveWhiteSpace = true;

              // loadHTML complains about HTML5 tags (article, nav, etc), so
              // errors are collected internally instead of being emitted, and
              // the XML parser is tried when the HTML parser gives up.
              $prev_use_errors = libxml_use_internal_errors(true);
              if ($doc->loadHTML($html)) {
                  // <html> -> <body> -> the wrapper <div> added above
                  $root_node = $doc->documentElement->lastChild->lastChild;
              } elseif ($doc->loadXML($html)) {
                  $root_node = $doc->documentElement;
              }
              if (!$prev_use_errors) {
                  // Collection was switched on only for this call, so drop
                  // what was gathered instead of letting it accumulate. When
                  // the caller had already enabled it, its buffer is left alone.
                  libxml_clear_errors();
              }
              libxml_use_internal_errors($prev_use_errors);

              if ($root_node === null) {
                  throw new \RuntimeException('Truncator: unable to parse the given HTML.');
              }
          }

          list($text, , $opts) = static::truncateNode($doc, $root_node, $length, $opts);

          // Strip the wrapper: "</div>" (6 chars) at the end, "<div>" (5) in front.
          return mb_substr(mb_substr($text, 0, -6), 5);
      }

      /**
       * Truncate one element and serialise it.
       *
       * The children of $node are replaced by the truncated content, so the
       * DOM passed in is modified.
       *
       * @param DOMDocument $doc document owning $node
       * @param \DOMNode $node element to truncate
       * @param integer $length remaining budget (words or characters)
       * @param array $opts current options
       * @return array array(serialised markup, units consumed, updated options)
       */
      protected static function truncateNode($doc, $node, $length, $opts)
      {
          if ($length === 0 && !static::ellipsable($node)) {
              // Budget exhausted: drop the element and report one consumed
              // unit so that the caller's remaining budget becomes negative.
              return array('', 1, $opts);
          }

          list($inner, $remaining, $opts) = static::innerTruncate($doc, $node, $length, $opts);

          if (0 === mb_strlen($inner)) {
              // Nothing survived; only content-less tags such as <br> are kept.
              $keep = in_array(mb_strtolower($node->nodeName), static::$self_closing_tags, true);
              return array($keep ? $doc->saveXML($node) : '', $length - $remaining, $opts);
          }

          while ($node->firstChild) {
              $node->removeChild($node->firstChild);
          }

          // handle the ampersand
          $fragment = $doc->createDocumentFragment();
          if ($fragment->appendXML(static::xmlEscape($inner))) {
              // Appended only when parsing succeeded: an empty fragment
              // cannot be appended and would just raise a second warning.
              $node->appendChild($fragment);
          }

          return array($doc->saveXML($node), $length - $remaining, $opts);
      }

      /**
       * Truncate the children of an element.
       *
       * Children are processed in document order until the budget becomes
       * negative. At that point, if $node may hold an ellipsis, trailing
       * whitespace, punctuation and entities are removed and the ellipsis is
       * appended; the 'ellipsis' option is then emptied so that it is emitted
       * only once, and 'was_truncated' is set in the options.
       *
       * @param DOMDocument $doc document owning $node
       * @param \DOMNode $node element whose children are processed
       * @param integer $length budget available for the children
       * @param array $opts current options
       * @return array array(inner markup, remaining budget, updated options)
       */
      protected static function innerTruncate($doc, $node, $length, $opts)
      {
          $inner = '';
          $remaining = $length;

          foreach ($node->childNodes as $childNode) {
              if ($childNode->nodeType === XML_ELEMENT_NODE) {
                  list($txt, $nb, $opts) = static::truncateNode($doc, $childNode, $remaining, $opts);
              } elseif ($childNode->nodeType === XML_TEXT_NODE) {
                  list($txt, $nb, $opts) = static::truncateText($doc, $childNode, $remaining, $opts);
              } else {
                  // Comments, processing instructions, etc. are dropped.
                  $txt = '';
                  $nb = 0;
              }

              // unhandle the ampersand
              $txt = static::xmlUnescape($txt);
              $remaining -= $nb;
              $inner .= $txt;

              if ($remaining < 0) {
                  if (static::ellipsable($node)) {
                      // The "u" modifier is required: without it \pP is matched
                      // against single bytes and can cut a multi-byte UTF-8
                      // character in half.
                      $trimmed = preg_replace('/(?:[\s\pP]+|(?:&(?:[a-z]+|#[0-9]+);?))*$/u', '', $inner);
                      if ($trimmed === null) {
                          // preg_replace() failed (e.g. invalid UTF-8); keep the text as is.
                          $trimmed = $inner;
                      }
                      $inner = $trimmed . $opts['ellipsis'];
                      $opts['ellipsis'] = '';
                      $opts['was_truncated'] = true;
                  }
                  break;
              }
          }

          return array($inner, $remaining, $opts);
      }

      /**
       * Truncate the content of a text node.
       *
       * The second element of the result is always the size of the whole text
       * node (in words or characters), not the size of the returned part, so
       * the caller's budget becomes negative whenever the text was cut.
       *
       * @param DOMDocument $doc document owning $node (unused here)
       * @param \DOMNode $node text node
       * @param integer $length remaining budget (words or characters)
       * @param array $opts current options
       * @return array array(text, size of the whole node, options)
       */
      protected static function truncateText($doc, $node, $length, $opts)
      {
          $string = $node->textContent;

          if ($opts['length_in_chars']) {
              $count = mb_strlen($string);
              if ($count <= $length && $length > 0) {
                  return array($string, $count, $opts);
              }

              if ($opts['word_safe']) {
                  // Cut at the first separator found at or after the limit,
                  // unless that separator is the last character of the text.
                  $breakpoint = mb_strpos($string, $opts['break'], $length);
                  if (false !== $breakpoint && $breakpoint < $count - 1) {
                      $string = mb_substr($string, 0, $breakpoint) . $opts['break'];
                  }
                  return array($string, $count, $opts);
              }

              return array(mb_substr($string, 0, $length), $count, $opts);
          }

          // Word mode: every match is a word together with its leading whitespace.
          preg_match_all('/\s*\S+/', $string, $matches);
          $words = $matches[0];
          $count = count($words);

          if ($count <= $length && $length > 0) {
              // The whole text fits. This used to return an undefined variable,
              // which silently removed every text node that fitted the budget.
              return array($string, $count, $opts);
          }

          return array(implode('', array_slice($words, 0, $length)), $count, $opts);
      }

      /**
       * Tell whether an ellipsis may be appended inside the given node.
       *
       * @param \DOMNode $node
       * @return boolean true for a document or a tag listed in $ellipsable_tags
       */
      protected static function ellipsable($node)
      {
          return ($node instanceof DOMDocument)
              || in_array(mb_strtolower($node->nodeName), static::$ellipsable_tags, true);
      }

      /**
       * Escape `&` and `<?` so the string can be given to appendXML().
       *
       * @param string $string
       * @return string
       */
      protected static function xmlEscape($string)
      {
          $string = str_replace('&', '&amp;', $string);
          $string = str_replace('<?', '&lt;?', $string);
          return $string;
      }

      /**
       * Reverse xmlEscape(): turn `&amp;` into `&` and `&lt;?` into `<?`.
       *
       * @param string $string
       * @return string
       */
      protected static function xmlUnescape($string)
      {
          $string = str_replace('&amp;', '&', $string);
          $string = str_replace('&lt;?', '<?', $string);
          return $string;
      }
  }