DOMWordsIterator.php 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. <?php
  2. /**
  3. * Iterates individual words of DOM text and CDATA nodes
  4. * while keeping track of their position in the document.
  5. *
  6. * Example:
  7. *
  8. * $doc = new DOMDocument();
  9. * $doc->load('example.xml');
  10. * foreach(new DOMWordsIterator($doc) as $word) echo $word;
  11. *
  12. * @author pjgalbraith http://www.pjgalbraith.com
  13. * @author porneL http://pornel.net (based on DOMLettersIterator available at http://pornel.net/source/domlettersiterator.php)
  14. * @license Public Domain
  15. *
  16. */
  /**
   * Iterates the whitespace-separated words found in the text and CDATA nodes
   * of a DOM subtree, in document order, while tracking where each word lives.
   *
   * Keys are a running 1-based word count that restarts on every rewind().
   */
  final class DOMWordsIterator implements Iterator
  {
      /** @var DOMElement|null Root of the iterated subtree; traversal never leaves it. */
      private $start;

      /** @var DOMNode|null Node the cursor is on, or NULL once iteration has finished. */
      private $current;

      /** @var int Index of the current word inside $words, or -1 if the current node is not split yet. */
      private $offset = -1;

      /** @var int Number of words produced since the last rewind() (so the first word has key 1). */
      private $key = 0;

      /** @var array List of array(word, byte offset in node text) pairs for the current text node. */
      private $words = array();

      /**
       * Expects DOMElement or DOMDocument (see DOMDocument::load and DOMDocument::loadHTML).
       *
       * @param DOMNode $el Element whose subtree is iterated, or a document (its root element is used).
       * @throws InvalidArgumentException When $el is neither a DOMElement nor a DOMDocument.
       */
      public function __construct(DOMNode $el)
      {
          if ($el instanceof DOMDocument) {
              // May be NULL for an empty document; the iterator is then simply empty.
              $this->start = $el->documentElement;
          } elseif ($el instanceof DOMElement) {
              $this->start = $el;
          } else {
              throw new InvalidArgumentException("Invalid arguments, expected DOMElement or DOMDocument");
          }
      }

      /**
       * Returns the position of the current word.
       *
       * The result is array(node, index, words) where:
       *  - node  is the text/CDATA node holding the word (NULL if the iterator has finished),
       *  - index is the zero-based index of the current word within that node,
       *  - words is the list of array(word, offset) pairs for that node, as produced by
       *    preg_split() with PREG_SPLIT_OFFSET_CAPTURE. Those offsets are BYTE offsets into
       *    the node text, so use substr()-style byte functions (not mb_substr()) with them.
       *
       * @return array
       */
      public function currentWordPosition()
      {
          return array($this->current, $this->offset, $this->words);
      }

      /**
       * Returns the parent node of the text node currently being iterated,
       * or NULL if the iterator has finished.
       *
       * @return DOMNode|null
       */
      public function currentElement()
      {
          return $this->current ? $this->current->parentNode : NULL;
      }

      // Implementation of Iterator interface.
      // The #[\ReturnTypeWillChange] lines silence the PHP 8.1+ tentative return type
      // deprecation and are parsed as ordinary comments by older PHP versions.

      /**
       * @return int 1-based ordinal of the current word since the last rewind().
       */
      #[\ReturnTypeWillChange]
      public function key()
      {
          return $this->key;
      }

      /**
       * Moves to the next word, walking forward through the subtree in document
       * order until a text/CDATA node with a remaining word is found.
       * Sets the current node to NULL when the subtree is exhausted.
       */
      #[\ReturnTypeWillChange]
      public function next()
      {
          // Iterative on purpose: one recursive call per visited node can exhaust
          // the stack on large documents.
          while ($this->current) {
              if ($this->isTextNode($this->current)) {
                  if ($this->offset == -1) {
                      // First visit of this node: split its text once and cache the result.
                      $this->words = $this->splitWords($this->current->textContent);
                  }
                  $this->offset++;
                  if ($this->offset < count($this->words)) {
                      $this->key++;
                      return;
                  }
                  // Node exhausted (or whitespace only); mark the next node as not split yet.
                  $this->offset = -1;
              }
              $this->current = $this->following($this->current);
          }
      }

      /**
       * @return string|null The current word, or NULL if the iterator has finished.
       */
      #[\ReturnTypeWillChange]
      public function current()
      {
          if ($this->current) {
              return $this->words[$this->offset][0];
          }
          return NULL;
      }

      /**
       * @return bool TRUE while there is a current word.
       */
      #[\ReturnTypeWillChange]
      public function valid()
      {
          return (bool) $this->current;
      }

      /**
       * Restarts iteration at the first word of the subtree and restarts key numbering.
       */
      #[\ReturnTypeWillChange]
      public function rewind()
      {
          $this->offset = -1;
          $this->words = array();
          // Without this reset a second pass would continue the previous pass's keys.
          $this->key = 0;
          $this->current = $this->start;
          $this->next();
      }

      /**
       * Tells whether a node carries text that should be split into words.
       */
      private function isTextNode(DOMNode $node)
      {
          return $node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE;
      }

      /**
       * Splits text on runs of newline, carriage return, tab and space.
       *
       * @return array List of array(word, byte offset) pairs; empty if there are no words.
       */
      private function splitWords($text)
      {
          $parts = preg_split("/[\n\r\t ]+/", $text, -1, PREG_SPLIT_NO_EMPTY|PREG_SPLIT_OFFSET_CAPTURE);
          // preg_split() returns FALSE on failure; treat that as "no words" rather than
          // passing FALSE on to count().
          return $parts === false ? array() : $parts;
      }

      /**
       * Returns the node after $node in document order, restricted to the subtree
       * rooted at the start element, or NULL when there is none.
       * Only element nodes are descended into.
       */
      private function following(DOMNode $node)
      {
          if ($node->nodeType == XML_ELEMENT_NODE && $node->firstChild) {
              return $node->firstChild;
          }
          // Climb until a next sibling exists. The start element's own siblings are
          // never considered, so a childless start element cannot leak into the
          // rest of the document.
          while ($node && $node !== $this->start) {
              if ($node->nextSibling) {
                  return $node->nextSibling;
              }
              $node = $node->parentNode;
          }
          return NULL;
      }
  }