DOMLettersIterator.php 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. <?php
  /**
   * Iterates over the individual characters (Unicode code points) of the text
   * and CDATA nodes found below a DOM element, in document order, while keeping
   * track of the position of each character in the document.
   *
   * Example:
   *
   *     $doc = new DOMDocument();
   *     $doc->load('example.xml');
   *     foreach (new DOMLettersIterator($doc) as $letter) {
   *         echo $letter;
   *     }
   *
   * NB: If you only need the characters without their position in the
   * document, use DOMNode->textContent instead.
   *
   * Keys are increasing integers but they are not contiguous: the counter is
   * bumped on every step taken inside a text node, including the step that
   * detects the end of that node. Keys therefore start at 1 and skip a value at
   * every text-node boundary. This numbering is kept for backward compatibility.
   *
   * @author porneL http://pornel.net
   * @license Public Domain
   * @url https://github.com/antoligy/dom-string-iterators
   *
   * @implements Iterator<int,string>
   */
  final class DOMLettersIterator implements Iterator
  {
      /** @var DOMElement Root of the subtree being iterated; traversal never leaves it. */
      private $start;

      /** @var DOMNode|null Node the cursor is on (a text/CDATA node while valid), null once finished. */
      private $current;

      /** @var int Character offset inside the current text node, -1 when no node has been entered. */
      private $offset = -1;

      /** @var int|null Iteration key; null until rewind() has been called. */
      private $key;

      /** @var array<int,string>|null Characters of the text node entered most recently. */
      private $letters;

      /**
       * Expects a DOMElement or a DOMDocument (see DOMDocument::load and DOMDocument::loadHTML).
       * For a document, its documentElement is used as the root of the iteration.
       *
       * @param DOMNode $el
       *
       * @throws InvalidArgumentException when $el is neither a DOMElement nor a DOMDocument
       *                                  with a document element
       */
      public function __construct(DOMNode $el)
      {
          if ($el instanceof DOMDocument) {
              $el = $el->documentElement;
          }

          if (!$el instanceof DOMElement) {
              throw new InvalidArgumentException('Invalid arguments, expected DOMElement or DOMDocument');
          }

          $this->start = $el;
      }

      /**
       * Returns the position in the text as a [node, character offset] pair.
       * The offset is NOT a byte offset; use mb_substr() or similar to apply it.
       * The node is null if the iterator has finished.
       *
       * @return array{0: DOMNode|null, 1: int}
       */
      public function currentTextPosition(): array
      {
          return [$this->current, $this->offset];
      }

      /**
       * Returns the element that contains the text node currently being iterated,
       * or null if the iterator has finished.
       *
       * @return DOMElement|null
       */
      public function currentElement(): ?DOMElement
      {
          if (null === $this->current) {
              return null;
          }

          $parent = $this->current->parentNode;

          // Guard the declared return type instead of trusting the tree shape.
          return $parent instanceof DOMElement ? $parent : null;
      }

      // Implementation of Iterator interface

      /**
       * @return int|null
       */
      public function key(): ?int
      {
          return $this->key;
      }

      /**
       * Moves the cursor to the next character, walking the subtree in document
       * order and skipping every node that is not a text or CDATA node.
       *
       * Implemented as a loop rather than by recursion so that long runs of
       * empty elements, comments or empty text nodes do not grow the call stack.
       *
       * @return void
       */
      public function next(): void
      {
          while (null !== $this->current) {
              $node = $this->current;

              if (self::isTextual($node)) {
                  if (-1 === $this->offset) {
                      // First visit of this node: split its content once.
                      $this->letters = self::splitLetters((string) $node->textContent);
                  }

                  ++$this->offset;
                  ++$this->key;

                  if ($this->offset < count($this->letters)) {
                      return;
                  }

                  // Node exhausted (or empty): leave it and look for the next one.
                  $this->offset = -1;
              } else {
                  // Descend along first children until a text node or a leaf is reached.
                  $reachedText = false;
                  while (XML_ELEMENT_NODE === $node->nodeType && null !== $node->firstChild) {
                      $node = $node->firstChild;
                      if (self::isTextual($node)) {
                          $reachedText = true;
                          break;
                      }
                  }

                  if ($reachedText) {
                      $this->current = $node;
                      continue;
                  }

                  if ($node === $this->start) {
                      // The root has no children at all: there is nothing to iterate,
                      // and moving to its siblings would leave the requested subtree.
                      $this->current = null;

                      return;
                  }
              }

              // Climb until a following sibling exists; reaching the root ends the iteration.
              while (null === $node->nextSibling && null !== $node->parentNode) {
                  $node = $node->parentNode;
                  if ($node === $this->start) {
                      $this->current = null;

                      return;
                  }
              }

              $this->current = $node->nextSibling;
          }
      }

      /**
       * Return the current element
       * @link https://php.net/manual/en/iterator.current.php
       *
       * @return string|null the current character, or null when there is none
       *                     (before rewind() or after the iteration has finished)
       */
      public function current(): ?string
      {
          // "??" also covers offset -1 after the end, which used to raise a warning.
          return $this->letters[$this->offset] ?? null;
      }

      /**
       * Checks if current position is valid
       * @link https://php.net/manual/en/iterator.valid.php
       *
       * @return bool
       */
      public function valid(): bool
      {
          return null !== $this->current;
      }

      /**
       * Resets the iterator and positions it on the first character of the subtree.
       *
       * @return void
       */
      public function rewind(): void
      {
          $this->current = $this->start;
          $this->offset = -1;
          $this->key = 0;
          $this->letters = [];
          $this->next();
      }

      /**
       * Tells whether the node carries character data this iterator yields.
       *
       * @param DOMNode $node
       *
       * @return bool
       */
      private static function isTextual(DOMNode $node): bool
      {
          return XML_TEXT_NODE === $node->nodeType || XML_CDATA_SECTION_NODE === $node->nodeType;
      }

      /**
       * Splits a UTF-8 string into its code points without requiring mb_* functions.
       *
       * @param string $text
       *
       * @return array<int,string> empty when $text is empty or the match fails
       */
      private static function splitLetters(string $text): array
      {
          $matches = [];
          if (false === preg_match_all('/./us', $text, $matches)) {
              return [];
          }

          return $matches[0] ?? [];
      }
  }