Truncator.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. <?php
  2. /**
  3. * @package Grav\Common\Helpers
  4. *
  5. * @copyright Copyright (c) 2015 - 2023 Trilby Media, LLC. All rights reserved.
  6. * @license MIT License; see LICENSE file for details.
  7. */
  8. namespace Grav\Common\Helpers;
  9. use DOMText;
  10. use DOMDocument;
  11. use DOMElement;
  12. use DOMNode;
  13. use DOMWordsIterator;
  14. use DOMLettersIterator;
  15. use function in_array;
  16. use function strlen;
  17. /**
  18. * This file is part of https://github.com/Bluetel-Solutions/twig-truncate-extension
  19. *
  20. * Copyright (c) 2015 Bluetel Solutions developers@bluetel.co.uk
  21. * Copyright (c) 2015 Alex Wilson ajw@bluetel.co.uk
  22. *
  23. * For the full copyright and license information, please view the LICENSE
  24. * file that was distributed with this source code.
  25. */
  class Truncator
  {
      /**
       * Safely truncates HTML by a given number of words.
       *
       * The markup is parsed into a DOM, walked word by word, and everything
       * after the last kept word is removed, so the result stays well-formed.
       *
       * @param  string $html     Input HTML.
       * @param  int    $limit    Limit to how many words we preserve. A value <= 0 disables truncation.
       * @param  string $ellipsis String to use as ellipsis (if any).
       * @return string           Safe truncated HTML, or the untouched input when nothing was cut.
       */
      public static function truncateWords($html, $limit = 0, $ellipsis = '')
      {
          if ($limit <= 0) {
              return $html;
          }

          $doc = self::htmlToDomDocument($html);

          // Detach the wrapper <div> so the traversal never leaves our content.
          $container = $doc->getElementsByTagName('div')->item(0);
          $container = $container->parentNode->removeChild($container);

          // Iterate over words.
          $iterator = new DOMWordsIterator($container);
          $truncated = false;

          foreach ($iterator as $word) {
              if ($iterator->key() < $limit) {
                  continue;
              }

              // Limit reached: as used here, the position is [text node, word index, word list],
              // where each word-list entry is [word text, offset of the word in the node].
              $position = $iterator->currentWordPosition();
              $curNode = $position[0];
              $offset = $position[1];
              $nodeWords = $position[2];

              // Keep the node's text up to the end of the current word.
              $curNode->nodeValue = substr(
                  $curNode->nodeValue,
                  0,
                  $nodeWords[$offset][1] + strlen($nodeWords[$offset][0])
              );

              self::removeProceedingNodes($curNode, $container);

              // Compare as a string so that an ellipsis of "0" is not discarded.
              if ((string) $ellipsis !== '') {
                  self::insertEllipsis($curNode, $ellipsis);
              }

              $truncated = true;
              break;
          }

          // Return original HTML if not truncated.
          if ($truncated) {
              $html = self::getCleanedHTML($doc, $container);
          }

          return $html;
      }

      /**
       * Safely truncates HTML by a given number of letters.
       *
       * @param  string $html     Input HTML.
       * @param  int    $limit    Limit to how many letters we preserve. A value <= 0 disables truncation.
       * @param  string $ellipsis String to use as ellipsis (if any).
       * @return string           Safe truncated HTML, or the untouched input when nothing was cut.
       */
      public static function truncateLetters($html, $limit = 0, $ellipsis = '')
      {
          if ($limit <= 0) {
              return $html;
          }

          $doc = self::htmlToDomDocument($html);

          // Detach the wrapper <div> so the traversal never leaves our content.
          $container = $doc->getElementsByTagName('div')->item(0);
          $container = $container->parentNode->removeChild($container);

          // Iterate over letters.
          $iterator = new DOMLettersIterator($container);
          $truncated = false;

          foreach ($iterator as $letter) {
              if ($iterator->key() < $limit) {
                  continue;
              }

              // Limit reached: as used here, the position is [text node, letter index in that node].
              $position = $iterator->currentTextPosition();
              $textNode = $position[0];

              // Keep the node's text up to and including the current letter.
              $textNode->nodeValue = mb_substr($textNode->nodeValue, 0, $position[1] + 1);

              self::removeProceedingNodes($textNode, $container);

              // Compare as a string so that an ellipsis of "0" is not discarded.
              if ((string) $ellipsis !== '') {
                  self::insertEllipsis($textNode, $ellipsis);
              }

              $truncated = true;
              break;
          }

          // Return original HTML if not truncated.
          if ($truncated) {
              $html = self::getCleanedHTML($doc, $container);
          }

          return $html;
      }

      /**
       * Builds a DOMDocument object from a string containing HTML.
       *
       * The HTML is wrapped in a single <div> so callers can locate the
       * content root. The libxml error-handling mode is restored before returning.
       *
       * @param  string $html HTML to load
       * @return DOMDocument  Returns a DOMDocument object.
       */
      public static function htmlToDomDocument($html)
      {
          // Normalise empty input, but keep "0": it is falsy yet real content.
          if (!$html && !is_numeric($html)) {
              $html = '';
          }

          // Transform multibyte entities which otherwise display incorrectly.
          $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, ~0], 'UTF-8');

          // Internal errors enabled as HTML5 not fully supported.
          $previousErrorMode = libxml_use_internal_errors(true);

          try {
              // Instantiate new DOMDocument object, and then load in UTF-8 HTML.
              $dom = new DOMDocument();
              $dom->encoding = 'UTF-8';
              $dom->loadHTML("<div>$html</div>");
          } finally {
              // Drop the collected parse errors and hand the global libxml
              // setting back the way we found it.
              libxml_clear_errors();
              libxml_use_internal_errors($previousErrorMode);
          }

          return $dom;
      }

      /**
       * Removes all nodes after the current node.
       *
       * Starting at $domNode, removes every following sibling, then climbs to
       * the parent and repeats until $topNode is reached. $topNode itself and
       * the ancestors of $domNode are kept.
       *
       * @param  DOMNode|DOMElement $domNode
       * @param  DOMNode|DOMElement $topNode
       * @return void
       */
      private static function removeProceedingNodes($domNode, $topNode)
      {
          $node = $domNode;

          // Iterative walk: no recursion, so long sibling chains cannot exhaust the call stack.
          while ($node !== null && $node !== $topNode) {
              $parent = $node->parentNode;
              if ($parent === null) {
                  break;
              }

              while ($node->nextSibling !== null) {
                  $parent->removeChild($node->nextSibling);
              }

              $node = $parent;
          }
      }

      /**
       * Clean extra code
       *
       * Empties the document and moves the container's children into it, so
       * the serialised output holds only the truncated content.
       *
       * @param  DOMDocument $doc
       * @param  DOMNode     $container
       * @return string
       */
      private static function getCleanedHTML(DOMDocument $doc, DOMNode $container)
      {
          while ($doc->firstChild) {
              $doc->removeChild($doc->firstChild);
          }

          // appendChild() moves the node, so firstChild advances on every pass.
          while ($container->firstChild) {
              $doc->appendChild($container->firstChild);
          }

          return trim($doc->saveHTML());
      }

      /**
       * Inserts an ellipsis
       *
       * @param  DOMNode|DOMElement $domNode  Element to insert after.
       * @param  string             $ellipsis Text used to suffix our document.
       * @return void
       */
      private static function insertEllipsis($domNode, $ellipsis)
      {
          // html tags to avoid appending the ellipsis to
          $avoid = ['a', 'strong', 'em', 'h1', 'h2', 'h3', 'h4', 'h5'];

          $parent = $domNode->parentNode;
          $grandParent = $parent !== null ? $parent->parentNode : null;

          if ($grandParent !== null && in_array($parent->nodeName, $avoid, true)) {
              // Place the ellipsis as a text node right after the parent element.
              // A null reference node makes insertBefore() append.
              $grandParent->insertBefore(new DOMText($ellipsis), $parent->nextSibling);
          } else {
              // Append to current node
              $domNode->nodeValue = rtrim($domNode->nodeValue) . $ellipsis;
          }
      }

      /**
       * Truncates text, optionally keeping HTML tags balanced, using string
       * scanning rather than a DOM.
       *
       * Lengths are measured in bytes (strlen/substr). With $considerHtml,
       * tags are not counted and a character reference counts as one.
       *
       * @param  string $text         Text to truncate.
       * @param  int    $length       Maximum length, including $ending.
       * @param  string $ending       Suffix appended to truncated text.
       * @param  bool   $exact        When false, cut back to the last space so no word is split.
       * @param  bool   $considerHtml When true, skip tags while counting and close any left open.
       * @return string
       */
      public function truncate(
          $text,
          $length = 100,
          $ending = '...',
          $exact = false,
          $considerHtml = true
      ) {
          // Named, decimal and hexadecimal character references.
          $entityPattern = '/&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};/i';
          $open_tags = [];

          if ($considerHtml) {
              // if the plain text is shorter than the maximum length, return the whole text
              if (strlen(preg_replace('/<.*?>/', '', $text)) <= $length) {
                  return $text;
              }

              // splits all html-tags to scannable lines
              preg_match_all('/(<.+?>)?([^<>]*)/s', $text, $lines, PREG_SET_ORDER);
              $total_length = strlen($ending);
              $truncate = '';

              foreach ($lines as $line_matchings) {
                  // if there is any html-tag in this line, handle it and add it (uncounted) to the output
                  if (!empty($line_matchings[1])) {
                      // an "empty element" with or without xhtml-conform closing slash needs no bookkeeping
                      $isVoidElement = preg_match('/^<(\s*.+?\/\s*|\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(\s.+?)?)>$/is', $line_matchings[1]);

                      if (!$isVoidElement) {
                          if (preg_match('/^<\s*\/([^\s]+?)\s*>$/s', $line_matchings[1], $tag_matchings)) {
                              // closing tag: delete it from the $open_tags list (names are stored lower-cased)
                              $pos = array_search(strtolower($tag_matchings[1]), $open_tags, true);
                              if ($pos !== false) {
                                  unset($open_tags[$pos]);
                              }
                          } elseif (preg_match('/^<\s*([^\s>!]+).*?>$/s', $line_matchings[1], $tag_matchings)) {
                              // opening tag: add it to the beginning of the $open_tags list
                              array_unshift($open_tags, strtolower($tag_matchings[1]));
                          }
                      }

                      // add html-tag to $truncate'd text
                      $truncate .= $line_matchings[1];
                  }

                  // calculate the length of the plain text part of the line; handle entities as one character
                  $content_length = strlen(preg_replace($entityPattern, ' ', $line_matchings[2]));

                  if ($total_length + $content_length > $length) {
                      // the number of characters which are left
                      $left = $length - $total_length;
                      $entities_length = 0;

                      // search for html entities
                      if (preg_match_all($entityPattern, $line_matchings[2], $entities, PREG_OFFSET_CAPTURE)) {
                          // calculate the real length of all entities in the legal range
                          foreach ($entities[0] as $entity) {
                              if ($entity[1] + 1 - $entities_length > $left) {
                                  // no more characters left
                                  break;
                              }

                              $left--;
                              $entities_length += strlen($entity[0]);
                          }
                      }

                      $truncate .= substr($line_matchings[2], 0, $left + $entities_length);

                      // maximum length is reached, so get off the loop
                      break;
                  }

                  $truncate .= $line_matchings[2];
                  $total_length += $content_length;

                  // if the maximum length is reached, get off the loop
                  if ($total_length >= $length) {
                      break;
                  }
              }
          } else {
              if (strlen($text) <= $length) {
                  return $text;
              }

              $truncate = substr($text, 0, $length - strlen($ending));
          }

          // if the words shouldn't be cut in the middle...
          if (!$exact) {
              // ...search the last occurrence of a space...
              // NOTE(review): this searches the assembled output, tags included, so a space
              // inside a tag's attributes can be picked as the cut position.
              $spacepos = strrpos($truncate, ' ');
              if (false !== $spacepos) {
                  // ...and cut the text in this position
                  $truncate = substr($truncate, 0, $spacepos);
              }
          }

          // add the defined ending to the text
          $truncate .= $ending;

          // close all unclosed html-tags (the list stays empty when HTML is not considered)
          foreach ($open_tags as $tag) {
              $truncate .= '</' . $tag . '>';
          }

          return $truncate;
      }
  }