123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- <?php
- /**
- * Iterates individual words of DOM text and CDATA nodes
- * while keeping track of their position in the document.
- *
- * Example:
- *
- * $doc = new DOMDocument();
- * $doc->load('example.xml');
- * foreach(new DOMWordsIterator($doc) as $word) echo $word;
- *
- * @author pjgalbraith http://www.pjgalbraith.com
- * @author porneL http://pornel.net (based on DOMLettersIterator available at http://pornel.net/source/domlettersiterator.php)
- * @license Public Domain
- *
- */
final class DOMWordsIterator implements Iterator
{
    /** @var DOMElement Root of the subtree being iterated; traversal never leaves it. */
    private $start;

    /** @var DOMNode|null Text/CDATA node holding the current word, or null before rewind() / after the end. */
    private $current = null;

    /** @var int Index of the current word inside $words, or -1 when the current node has not been split yet. */
    private $offset = -1;

    /** @var int Running 1-based count of the words yielded since the last rewind(). */
    private $key = 0;

    /** @var array List of array(word, byte offset) pairs produced by preg_split() for the current node. */
    private $words = array();

    /**
     * Expects DOMElement or DOMDocument (see DOMDocument::load and DOMDocument::loadHTML).
     * For a DOMDocument the iteration starts at its document element.
     *
     * @param DOMNode $el
     * @throws InvalidArgumentException when $el is neither a DOMDocument nor a DOMElement
     */
    public function __construct(DOMNode $el)
    {
        if ($el instanceof DOMDocument) {
            $this->start = $el->documentElement;
        } elseif ($el instanceof DOMElement) {
            $this->start = $el;
        } else {
            throw new InvalidArgumentException("Invalid arguments, expected DOMElement or DOMDocument");
        }
    }

    /**
     * Returns the position of the current word as a three element array:
     *   [0] the text/CDATA node containing the word (null if the iterator has finished),
     *   [1] the index of the word among the words of that node,
     *   [2] every word of that node as array(word, offset) pairs.
     *
     * The offsets come from PREG_SPLIT_OFFSET_CAPTURE and are therefore BYTE offsets
     * into the node's textContent (use substr(), not mb_substr(), with them).
     *
     * @return array
     */
    public function currentWordPosition()
    {
        return array($this->current, $this->offset, $this->words);
    }

    /**
     * Returns the parent of the text node that is currently being iterated,
     * or null if the iterator has finished.
     *
     * @return DOMNode|null
     */
    public function currentElement()
    {
        return $this->current ? $this->current->parentNode : null;
    }

    // Implementation of Iterator interface.
    // #[\ReturnTypeWillChange] silences the PHP 8.1+ deprecation about missing
    // return types; older PHP versions parse the attribute line as a comment.

    /**
     * @return int 1-based ordinal of the current word within this pass
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->key;
    }

    /**
     * Moves to the next word in document order, or marks the iterator as
     * finished (current node becomes null) once the start subtree is exhausted.
     *
     * Implemented as a loop rather than recursion so that long runs of nodes
     * without text do not deepen the call stack.
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        while ($this->current) {
            if ($this->holdsText($this->current)) {
                if ($this->offset === -1) {
                    // First visit of this node: split its text into words once.
                    $words = preg_split(
                        "/[\n\r\t ]+/",
                        $this->current->textContent,
                        -1,
                        PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE
                    );
                    $this->words = ($words === false) ? array() : $words;
                }
                $this->offset++;

                if ($this->offset < count($this->words)) {
                    $this->key++;
                    return;
                }
                // Node exhausted (or contained only whitespace): move on.
                $this->offset = -1;
            } elseif ($this->descendToText()) {
                // Landed on a text node; let the next pass split it.
                continue;
            }

            $this->stepToNextNode();
        }
    }

    /**
     * @return string|null the current word, or null if the iterator has finished
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        if ($this->current) {
            return $this->words[$this->offset][0];
        }
        return null;
    }

    /**
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return (bool) $this->current;
    }

    /**
     * Restarts the iteration at the first word of the start element.
     * The word counter is reset so every pass numbers its words from 1.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->offset = -1;
        $this->words = array();
        $this->key = 0;
        $this->current = $this->start;
        $this->next();
    }

    /**
     * Tells whether the node is a text or CDATA node, i.e. one whose words are iterated.
     *
     * @param DOMNode $node
     * @return bool
     */
    private function holdsText(DOMNode $node)
    {
        return $node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE;
    }

    /**
     * Follows first-child links downwards from the current element.
     *
     * @return bool true if the cursor now rests on a text/CDATA node, false if
     *              the descent stopped on a childless element or another node type
     */
    private function descendToText()
    {
        while ($this->current->nodeType === XML_ELEMENT_NODE && $this->current->firstChild) {
            $this->current = $this->current->firstChild;
            if ($this->holdsText($this->current)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Moves the cursor to the next node in document order that is not a
     * descendant of the current one, finishing the iteration (cursor = null)
     * instead of leaving the start subtree.
     */
    private function stepToNextNode()
    {
        // A childless start element must not hand over to its own sibling.
        if ($this->current === $this->start) {
            $this->current = null;
            return;
        }
        while (!$this->current->nextSibling && $this->current->parentNode) {
            $this->current = $this->current->parentNode;
            if ($this->current === $this->start) {
                $this->current = null;
                return;
            }
        }
        $this->current = $this->current->nextSibling;
    }
}
|