Truncator.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. <?php
  2. /**
  3. * @package Grav\Common\Helpers
  4. *
  5. * @copyright Copyright (C) 2015 - 2019 Trilby Media, LLC. All rights reserved.
  6. * @license MIT License; see LICENSE file for details.
  7. */
  8. namespace Grav\Common\Helpers;
  9. use DOMText;
  10. use DOMDocument;
  11. use DOMElement;
  12. use DOMNode;
  13. use DOMWordsIterator;
  14. use DOMLettersIterator;
  15. /**
  16. * This file is part of https://github.com/Bluetel-Solutions/twig-truncate-extension
  17. *
  18. * Copyright (c) 2015 Bluetel Solutions developers@bluetel.co.uk
  19. * Copyright (c) 2015 Alex Wilson ajw@bluetel.co.uk
  20. *
  21. * For the full copyright and license information, please view the LICENSE
  22. * file that was distributed with this source code.
  23. */
  24. class Truncator {
  /**
   * Truncates an HTML fragment once a word threshold is reached, keeping the markup well formed.
   *
   * The fragment is parsed into a DOM tree and walked word by word. When the
   * iterator key reaches $limit, the text node holding that word is cut right
   * after it, every later node is removed, and the tree is serialized again.
   *
   * @param string $html     Input HTML.
   * @param int    $limit    Word threshold; zero or a negative value returns $html untouched.
   * @param string $ellipsis Suffix added at the cut point (skipped when empty).
   * @return string Truncated HTML, or the original input when no cut was needed.
   */
  public static function truncateWords($html, $limit = 0, $ellipsis = '')
  {
      if ($limit <= 0) {
          return $html;
      }

      $document = self::htmlToDomDocument($html);

      // Detach the wrapping <div> so the walk and the pruning stay inside the caller's markup.
      $wrapper = $document->getElementsByTagName('div')->item(0);
      $wrapper = $wrapper->parentNode->removeChild($wrapper);

      $iterator = new DOMWordsIterator($wrapper);
      foreach ($iterator as $unused) {
          // Still below the threshold: keep walking.
          if ($iterator->key() < $limit) {
              continue;
          }

          // Position is read as [text node, index of the word in that node, words of that node];
          // each word entry is presumably a [string, byte offset] pair -- verify against DOMWordsIterator.
          $position = $iterator->currentWordPosition();
          $textNode = $position[0];
          $boundaryWord = $position[2][$position[1]];

          // Keep the node's text up to and including the boundary word.
          $textNode->nodeValue = substr(
              $textNode->nodeValue,
              0,
              $boundaryWord[1] + strlen($boundaryWord[0])
          );

          self::removeProceedingNodes($textNode, $wrapper);

          if (!empty($ellipsis)) {
              self::insertEllipsis($textNode, $ellipsis);
          }

          return self::getCleanedHTML($document, $wrapper);
      }

      // The threshold was never reached: hand back the input unchanged.
      return $html;
  }
  /**
   * Truncates an HTML fragment once a letter threshold is reached, keeping the markup well formed.
   *
   * The fragment is parsed into a DOM tree and walked letter by letter. When the
   * iterator key reaches $limit, the current text node is shortened, every later
   * node is removed, and the tree is serialized again.
   *
   * @param string $html     Input HTML.
   * @param int    $limit    Letter threshold; zero or a negative value returns $html untouched.
   * @param string $ellipsis Suffix added at the cut point (skipped when empty).
   * @return string Truncated HTML, or the original input when no cut was needed.
   */
  public static function truncateLetters($html, $limit = 0, $ellipsis = '')
  {
      if ($limit <= 0) {
          return $html;
      }

      $document = self::htmlToDomDocument($html);

      // Detach the wrapping <div> so the walk and the pruning stay inside the caller's markup.
      $wrapper = $document->getElementsByTagName('div')->item(0);
      $wrapper = $wrapper->parentNode->removeChild($wrapper);

      $iterator = new DOMLettersIterator($wrapper);
      foreach ($iterator as $unused) {
          // Still below the threshold: keep walking.
          if ($iterator->key() < $limit) {
              continue;
          }

          // Position is read as [text node, offset inside that node]
          // (presumably a character offset -- verify against DOMLettersIterator).
          $position = $iterator->currentTextPosition();
          $textNode = $position[0];
          $keep = $position[1] + 1;

          // Keep the node's text up to and including the current letter.
          $textNode->nodeValue = mb_substr($textNode->nodeValue, 0, $keep);

          self::removeProceedingNodes($textNode, $wrapper);

          if (!empty($ellipsis)) {
              self::insertEllipsis($textNode, $ellipsis);
          }

          return self::getCleanedHTML($document, $wrapper);
      }

      // The threshold was never reached: hand back the input unchanged.
      return $html;
  }
  /**
   * Builds a DOMDocument from a string containing an HTML fragment.
   *
   * The fragment is wrapped in a single <div> before parsing so callers can
   * locate it again through the first <div> of the returned document.
   *
   * @param string $html HTML to load; any falsy value is treated as an empty string.
   * @return DOMDocument Document holding the parsed fragment inside a <div>.
   */
  public static function htmlToDomDocument($html)
  {
      if (!$html) {
          $html = '';
      }

      // Escape every non-ASCII character as a numeric entity so the parser cannot
      // misread multibyte text. This replaces mb_convert_encoding() with the
      // 'HTML-ENTITIES' target, which is deprecated as of PHP 8.2.
      $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

      // Collect parser complaints internally instead of emitting warnings
      // (HTML5 is not fully supported). The flag is intentionally left enabled
      // afterwards, exactly as before.
      libxml_use_internal_errors(true);

      // Instantiate a new DOMDocument and load the wrapped fragment.
      $dom = new DOMDocument();
      $dom->encoding = 'UTF-8';
      $dom->loadHTML("<div>$html</div>");

      // Nothing reads the collected errors; drop them so the buffer does not
      // grow with every call.
      libxml_clear_errors();

      return $dom;
  }
  /**
   * Removes every node that follows the given node, up to (but not including) $topNode.
   *
   * Starting at $domNode and climbing through its ancestors, all following
   * siblings found on each level are detached. The climb stops at $topNode, whose
   * own siblings are never touched. The walk is iterative, so its depth does not
   * depend on how many siblings have to be removed.
   *
   * @param DOMNode|DOMElement $domNode Last node to keep.
   * @param DOMNode|DOMElement $topNode Ancestor that bounds the clean-up.
   * @return void
   */
  private static function removeProceedingNodes($domNode, $topNode)
  {
      // The null check ends the climb if $topNode is not an ancestor of $domNode.
      for ($node = $domNode; $node !== null && $node !== $topNode; $node = $node->parentNode) {
          // A node with a next sibling always has a parent, so the removal is safe.
          while ($node->nextSibling !== null) {
              $node->parentNode->removeChild($node->nextSibling);
          }
      }
  }
  /**
   * Serializes the children of $container without any surrounding document markup.
   *
   * The document is emptied first, then every child of $container is moved to
   * the document's top level so that saveHTML() outputs only those nodes.
   *
   * @param DOMDocument        $doc       Document used for serialization; its current children are discarded.
   * @param DOMNode|DOMElement $container Node whose children make up the result.
   * @return string Trimmed HTML of the container's children.
   */
  private static function getCleanedHTML(DOMDocument $doc, $container)
  {
      // Drop whatever the document currently holds.
      for ($child = $doc->firstChild; $child !== null; $child = $doc->firstChild) {
          $doc->removeChild($child);
      }

      // Appending moves a node, so the container shrinks on every pass.
      for ($child = $container->firstChild; $child !== null; $child = $container->firstChild) {
          $doc->appendChild($child);
      }

      return trim($doc->saveHTML());
  }
  /**
   * Inserts an ellipsis after the given text node.
   *
   * When the node sits directly inside one of the "avoid" elements (links,
   * emphasis, headings h1-h5), the ellipsis is placed as a new text node right
   * after that element, so it does not become part of it. Otherwise it is
   * appended to the node's own text, after stripping trailing whitespace.
   *
   * @param DOMNode|DOMElement $domNode  Element to insert after.
   * @param string             $ellipsis Text used to suffix our document.
   * @return void
   */
  private static function insertEllipsis($domNode, $ellipsis)
  {
      // HTML tags to avoid appending the ellipsis to.
      $avoid = ['a', 'strong', 'em', 'h1', 'h2', 'h3', 'h4', 'h5'];

      $parent = $domNode->parentNode;
      $grandParent = $parent !== null ? $parent->parentNode : null;

      if ($grandParent !== null && in_array($parent->nodeName, $avoid, true)) {
          // Insert right after the avoided element. The reference node must be a
          // child of $grandParent; a null reference makes insertBefore() append.
          // The previous code passed the grandparent's own next sibling, which is
          // not one of its children and made insertBefore() throw.
          $grandParent->insertBefore(new DOMText($ellipsis), $parent->nextSibling);

          return;
      }

      // Append to the current node.
      $domNode->nodeValue = rtrim($domNode->nodeValue) . $ellipsis;
  }
  /**
   * Truncates text to a maximum length using string and regex scanning (no DOM parsing).
   *
   * In HTML mode tags are copied to the output without being counted, an HTML
   * entity counts as one character, and every tag still open at the cut point
   * is closed after the ending. Text that already fits is returned unchanged.
   *
   * Lengths are measured with strlen()/substr(), i.e. in bytes. When $exact is
   * false the cut falls back to the last space of the collected output; in HTML
   * mode that space may lie inside a tag's attribute list.
   *
   * @param string $text         Text to truncate.
   * @param int    $length       Maximum length of the result, the ending included.
   * @param string $ending       Suffix appended to truncated text.
   * @param bool   $exact        When false, cut at the last space instead of inside a word.
   * @param bool   $considerHtml When true, treat $text as HTML as described above.
   * @return string The truncated text, or $text itself when it already fits.
   */
  public function truncate(
      $text,
      $length = 100,
      $ending = '...',
      $exact = false,
      $considerHtml = true
  ) {
      // One HTML entity: named, decimal or hexadecimal. The hexadecimal branch
      // requires its "&#x" prefix so plain text such as "ab;" is not taken for an entity.
      $entityPattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i';
      $open_tags = [];

      if ($considerHtml) {
          // if the plain text is shorter than the maximum length, return the whole text
          if (strlen(preg_replace('/<.*?>/', '', $text)) <= $length) {
              return $text;
          }

          // split the input into (optional tag, following text) pairs
          preg_match_all('/(<.+?>)?([^<>]*)/s', $text, $lines, PREG_SET_ORDER);

          // the ending counts against the budget from the start
          $total_length = strlen($ending);
          $truncate = '';

          foreach ($lines as $line_matchings) {
              // if there is a tag in this pair, handle it and add it (uncounted) to the output
              if (!empty($line_matchings[1])) {
                  if (preg_match('/^<(\s*.+?\/\s*|\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\s.+?)?)>$/is', $line_matchings[1])) {
                      // "empty element", with or without xhtml closing slash: nothing to track
                  } elseif (preg_match('/^<\s*\/([^\s]+?)\s*>$/s', $line_matchings[1], $tag_matchings)) {
                      // closing tag: drop it from the open list. Open tags are stored
                      // lower-cased, so the lookup has to be lower-cased as well.
                      $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                      if ($pos !== false) {
                          unset($open_tags[$pos]);
                      }
                  } elseif (preg_match('/^<\s*([^\s>!]+).*?>$/s', $line_matchings[1], $tag_matchings)) {
                      // opening tag: remember it, innermost first
                      array_unshift($open_tags, strtolower($tag_matchings[1]));
                  }

                  $truncate .= $line_matchings[1];
              }

              // length of the plain text part; every entity counts as one character
              $content_length = strlen(preg_replace($entityPattern, ' ', $line_matchings[2]));

              if ($total_length + $content_length > $length) {
                  // the number of characters which are left
                  $left = $length - $total_length;
                  $entities_length = 0;

                  // entities inside the remaining range take one character of budget
                  // each, but their full byte length has to be copied
                  if (preg_match_all($entityPattern, $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                      foreach ($entities[0] as $entity) {
                          if ($entity[1] + 1 - $entities_length <= $left) {
                              $left--;
                              $entities_length += strlen($entity[0]);
                          } else {
                              // no more characters left
                              break;
                          }
                      }
                  }

                  $truncate .= substr($line_matchings[2], 0, $left + $entities_length);
                  // maximum length is reached, so leave the loop
                  break;
              }

              $truncate .= $line_matchings[2];
              $total_length += $content_length;

              // if the maximum length is reached, leave the loop
              if ($total_length >= $length) {
                  break;
              }
          }
      } else {
          if (strlen($text) <= $length) {
              return $text;
          }

          $truncate = substr($text, 0, $length - strlen($ending));
      }

      // if the words shouldn't be cut in the middle...
      if (!$exact) {
          // ...search the last occurrence of a space...
          $spacepos = strrpos($truncate, ' ');
          // strrpos() returns false when there is no space; in that case keep the
          // text as it is instead of cutting it down to nothing
          if ($spacepos !== false) {
              // ...and cut the text in this position
              $truncate = substr($truncate, 0, $spacepos);
          }
      }

      // add the defined ending to the text
      $truncate .= $ending;

      if ($considerHtml) {
          // close all unclosed html-tags
          foreach ($open_tags as $tag) {
              $truncate .= '</' . $tag . '>';
          }
      }

      return $truncate;
  }
  291. }