123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382 |
- <?php
- /**
- * @file
- * Contains CSV Parser.
- *
- * Functions in this file are independent of the Feeds specific implementation.
- * Thanks to jpetso http://drupal.org/user/56020 for most of the code in this
- * file.
- */
/**
 * Text lines from file iterator.
 *
 * Reads a file line by line through a file handle. In addition to the
 * Iterator interface it supports rewinding to an arbitrary byte offset and
 * reporting the byte offset reached so far via currentPos(), so that a file
 * can be processed in several batches.
 */
class ParserCSVIterator implements Iterator {
  private $handle;
  private $currentLine;
  private $currentPos;

  /**
   * Opens the given file for reading.
   *
   * @param string $filepath
   *   Path of the file to read. If the file cannot be opened, the handle is
   *   FALSE, rewind()/next() do nothing and the iterator is never valid.
   */
  public function __construct($filepath) {
    $this->handle = fopen($filepath, 'r');
    $this->currentLine = NULL;
    $this->currentPos = NULL;
  }

  /**
   * Closes the file handle, if one was opened.
   */
  public function __destruct() {
    if ($this->handle) {
      fclose($this->handle);
    }
  }

  /**
   * Moves to a byte position in the file and loads the line found there.
   *
   * @param int $pos
   *   Byte offset to seek to. Defaults to 0, the beginning of the file.
   */
  #[\ReturnTypeWillChange]
  public function rewind($pos = 0) {
    if ($this->handle) {
      fseek($this->handle, $pos);
      $this->next();
    }
  }

  /**
   * Reads the next line from the file.
   *
   * @return string|null
   *   The line that was read (including its line ending), or NULL when the
   *   end of the file has been reached or no file is open.
   */
  #[\ReturnTypeWillChange]
  public function next() {
    if ($this->handle) {
      $line = feof($this->handle) ? FALSE : fgets($this->handle);
      // feof() only reports TRUE after a read has hit the end of the file, so
      // fgets() may still return FALSE here. Store NULL in that case, since
      // valid() relies on isset() and would otherwise report a bogus line.
      $this->currentLine = ($line === FALSE) ? NULL : $line;
      $this->currentPos = ftell($this->handle);
      return $this->currentLine;
    }
  }

  /**
   * Tells whether a line is currently available.
   *
   * @return bool
   *   TRUE if current() holds a line, FALSE otherwise.
   */
  #[\ReturnTypeWillChange]
  public function valid() {
    return isset($this->currentLine);
  }

  /**
   * Returns the line that was read last.
   *
   * @return string|null
   *   The current line, or NULL if there is none.
   */
  #[\ReturnTypeWillChange]
  public function current() {
    return $this->currentLine;
  }

  /**
   * Returns the file position recorded after the last read.
   *
   * @return int|false|null
   *   The ftell() result taken right after the current line was read, or NULL
   *   if nothing has been read yet.
   */
  public function currentPos() {
    return $this->currentPos;
  }

  /**
   * Returns the iteration key, which is the constant string 'line'.
   *
   * @return string
   *   Always 'line'.
   */
  #[\ReturnTypeWillChange]
  public function key() {
    return 'line';
  }

}
/**
 * Functionality to parse CSV files into a two dimensional array.
 */
class ParserCSV {
  private $delimiter;
  private $fromEncoding;
  private $toEncoding;
  private $skipFirstLine;
  private $columnNames;
  private $timeout;
  private $timeoutReached;
  private $startByte;
  private $lineLimit;
  private $lastLinePos;
  private $useMbString;

  /**
   * Initializes the parser with its default settings.
   *
   * Defaults: comma delimiter, UTF-8 source and target encoding, first line
   * not skipped, no column names, no timeout, start at byte 0 and no line
   * limit. Encoding checks are enabled when the mbstring extension is loaded
   * and the 'feeds_use_mbstring' variable is not disabled.
   */
  public function __construct() {
    $this->delimiter = ',';
    $this->fromEncoding = 'UTF-8';
    $this->toEncoding = 'UTF-8';
    $this->skipFirstLine = FALSE;
    $this->columnNames = FALSE;
    $this->timeout = FALSE;
    $this->timeoutReached = FALSE;
    $this->startByte = 0;
    $this->lineLimit = 0;
    $this->lastLinePos = 0;
    $this->useMbString = FALSE;

    // NOTE(review): this ini setting is deprecated as of PHP 8.1. It is kept
    // because it affects how line endings are detected when reading files.
    ini_set('auto_detect_line_endings', TRUE);
    if (extension_loaded('mbstring') && variable_get('feeds_use_mbstring', TRUE)) {
      $this->useMbString = TRUE;
    }
  }

  /**
   * Set the column delimiter string.
   *
   * By default, the comma (',') is used as delimiter.
   *
   * @param string $delimiter
   *   The delimiter that separates two fields of a row.
   */
  public function setDelimiter($delimiter) {
    $this->delimiter = $delimiter;
  }

  /**
   * Sets the source file encoding.
   *
   * By default, the encoding is UTF-8.
   *
   * @param string $encoding
   *   The encoding to set.
   */
  public function setEncoding($encoding) {
    $this->fromEncoding = $encoding;
  }

  /**
   * Sets whether the first non-empty line should be skipped.
   *
   * Set this to TRUE if the parser should skip the first line of the CSV
   * text, which might be desired if the first line contains the column names.
   * By default, this is set to FALSE and the first line is not skipped.
   *
   * @param bool $skipFirstLine
   *   TRUE to skip the first line, FALSE to parse it as data.
   */
  public function setSkipFirstLine($skipFirstLine) {
    $this->skipFirstLine = $skipFirstLine;
  }

  /**
   * Sets the column names to use as keys of the parsed rows.
   *
   * Specify an array of column names if you know them in advance, or FALSE
   * (which is the default) to unset any prior column names. If no column
   * names are set, the parser will put each row into a simple numerically
   * indexed array. If column names are given, the parser will create arrays
   * with these column names as array keys instead.
   *
   * @param array|false $columnNames
   *   List of column names, or FALSE for numerically indexed rows.
   */
  public function setColumnNames($columnNames) {
    $this->columnNames = $columnNames;
  }

  /**
   * Sets the time after which parsing is interrupted.
   *
   * Define the time (in milliseconds) after which the parser stops parsing,
   * even if it has not yet finished processing the CSV data. If the timeout
   * has been reached before parsing is done, the parse() method will return
   * an incomplete list of rows - a single row will never be cut off in the
   * middle, though. By default, no timeout ($timeout == FALSE) is defined.
   *
   * You can check if the timeout has been reached by calling the
   * timeoutReached() method after parse() has been called.
   *
   * @param int|float|false $timeout
   *   Timeout in milliseconds, or FALSE for no timeout.
   */
  public function setTimeout($timeout) {
    $this->timeout = $timeout;
  }

  /**
   * Tells whether the last parse() call was interrupted by the timeout.
   *
   * After calling the parse() method, determine if the timeout (set by the
   * setTimeout() method) has been reached.
   *
   * @return bool
   *   TRUE if the timeout was reached, FALSE otherwise.
   *
   * @deprecated Use lastLinePos() instead to determine whether a file has
   *   finished parsing.
   */
  public function timeoutReached() {
    return $this->timeoutReached;
  }

  /**
   * Define the number of lines to parse in one parsing operation.
   *
   * By default, all lines of a file are being parsed.
   *
   * @param int $lines
   *   Maximum number of rows to return per parse() call, 0 for no limit.
   */
  public function setLineLimit($lines) {
    $this->lineLimit = $lines;
  }

  /**
   * Get the byte number where the parser left off after last parse() call.
   *
   * @return int
   *   0 if all lines or no line has been parsed, the byte position of where a
   *   timeout or the line limit has been reached otherwise. This position can
   *   be used to set the start byte for the next iteration after parse() has
   *   reached the timeout set with setTimeout() or the line limit set with
   *   setLineLimit().
   *
   * @see ParserCSV::setStartByte()
   */
  public function lastLinePos() {
    return $this->lastLinePos;
  }

  /**
   * Set the byte where file should be started to read.
   *
   * Useful when parsing a file in batches.
   *
   * @param int $start
   *   The byte offset handed to the line iterator's rewind() method.
   *
   * @return int
   *   The start byte that was set.
   */
  public function setStartByte($start) {
    return $this->startByte = $start;
  }

  /**
   * Parse CSV files into a two dimensional array.
   *
   * Parsing starts at the byte set with setStartByte() and stops at the end
   * of the input, or earlier when the line limit (setLineLimit()) or the
   * timeout (setTimeout()) is reached. In the latter two cases lastLinePos()
   * reports where to resume.
   *
   * @param Iterator $lineIterator
   *   An Iterator object that yields line strings, e.g. ParserCSVIterator.
   *   Its rewind() method is called with the start byte and it must provide a
   *   currentPos() method returning the position to resume from.
   *
   * @return array
   *   Two dimensional array that contains the data in the CSV file. Rows are
   *   keyed by column name if column names were set, numerically otherwise.
   *
   * @throws ParserCSVEncodingException
   *   Thrown by fixEncoding() when a line is not in the source encoding.
   */
  public function parse(Iterator $lineIterator) {
    $skipLine = $this->skipFirstLine;
    $rows = array();

    $this->timeoutReached = FALSE;
    $this->lastLinePos = 0;
    // microtime(TRUE) returns the current time in seconds as a float, whereas
    // the timeout is documented in milliseconds.
    $maxTime = empty($this->timeout) ? FALSE : (microtime(TRUE) + $this->timeout / 1000);
    $linesParsed = 0;

    for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {
      // Make really sure we've got lines without trailing newlines.
      $line = trim((string) $this->fixEncoding($lineIterator->current()), "\r\n");

      // Skip empty lines. A strict comparison is required here: empty() would
      // also discard a line that consists of the single character "0".
      if ($line === '') {
        continue;
      }
      // If the first line contains column names, skip it.
      if ($skipLine) {
        $skipLine = FALSE;
        continue;
      }

      // Split the record into its fields. This may advance the iterator if a
      // quoted field spans several lines.
      $fields = $this->parseLine($line, $lineIterator);

      if (empty($this->columnNames)) {
        $row = $fields;
      }
      else {
        // Map the fields to the column names, padding missing fields with
        // empty strings and dropping surplus fields.
        $row = array();
        foreach ($this->columnNames as $columnName) {
          $field = array_shift($fields);
          $row[$columnName] = isset($field) ? $field : '';
        }
      }
      $rows[] = $row;

      // Quit parsing if timeout has been reached or requested lines have been
      // reached.
      if ($maxTime !== FALSE && microtime(TRUE) > $maxTime) {
        $this->timeoutReached = TRUE;
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
      $linesParsed++;
      if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
    }
    return $rows;
  }

  /**
   * Splits one CSV record into its fields.
   *
   * explode() is not suitable for this because the delimiter might be located
   * inside a quoted field. Quoted fields may contain doubled quotes ("") as
   * an escaped quote and may span several lines, in which case further lines
   * are fetched from the iterator.
   *
   * @param string $line
   *   The first line of the record, without trailing line ending.
   * @param Iterator $lineIterator
   *   The iterator the line was taken from.
   *
   * @return array
   *   The fields of the record as strings.
   *
   * @throws ParserCSVEncodingException
   *   Thrown by fixEncoding() when a continuation line is not in the source
   *   encoding.
   */
  private function parseLine($line, Iterator $lineIterator) {
    $delimiterLength = strlen($this->delimiter);
    $quoted = FALSE;
    $currentIndex = 0;
    $currentField = '';
    $fields = array();

    // We must use strlen() as we're parsing byte by byte using strpos(), so
    // drupal_strlen() will not work properly. The comparison is "<=" so that
    // a record ending in a delimiter or a closing quote still gets its last
    // field recorded.
    while ($currentIndex <= strlen($line)) {
      if ($quoted) {
        $nextQuoteIndex = strpos($line, '"', $currentIndex);

        if ($nextQuoteIndex === FALSE) {
          // There's a line break before the quote is closed, so fetch the
          // next line and start from there.
          $currentField .= substr($line, $currentIndex);
          $lineIterator->next();

          if (!$lineIterator->valid()) {
            // An unclosed quote at the end of the input: record what we have
            // collected so far nevertheless.
            $fields[] = $currentField;
            break;
          }
          // Ok, so, on with fetching the next line, as mentioned above.
          $currentField .= "\n";
          $line = trim((string) $this->fixEncoding($lineIterator->current()), "\r\n");
          $currentIndex = 0;
          continue;
        }

        // There's actually another quote in this line, find out whether it's
        // escaped or not.
        $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);

        if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {
          // Escaped quote, add a single one to the field and proceed quoted.
          $currentField .= '"';
          $currentIndex = $nextQuoteIndex + 2;
        }
        else {
          // End of the quoted section, close the quote and let the unquoted
          // branch below finalize the field.
          $quoted = FALSE;
          $currentIndex = $nextQuoteIndex + 1;
        }
      }
      else {
        // Not inside quotes: find out where the next character of interest
        // is, which is either a quote or a delimiter.
        $nextQuoteIndex = strpos($line, '"', $currentIndex);
        $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex);

        if ($nextQuoteIndex === FALSE) {
          $nextIndex = $nextDelimiterIndex;
        }
        elseif ($nextDelimiterIndex === FALSE) {
          $nextIndex = $nextQuoteIndex;
        }
        else {
          $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
        }

        if ($nextIndex === FALSE) {
          // This line is done, add the rest of it as last field.
          $currentField .= substr($line, $currentIndex);
          $fields[] = $currentField;
          break;
        }
        elseif ($line[$nextIndex] === $this->delimiter[0]) {
          // The field ends right before the delimiter. Skip the complete
          // delimiter afterwards, which may be longer than a single byte.
          $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
          $fields[] = $currentField;
          $currentField = '';
          $currentIndex = $nextIndex + $delimiterLength;
        }
        else {
          // An opening quote: continue this field in the quoted branch.
          $quoted = TRUE;
          $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
          $currentIndex = $nextIndex + 1;
        }
      }
    }

    return $fields;
  }

  /**
   * Converts encoding of input data.
   *
   * If mbstring handling is not enabled, the data is returned unchanged and
   * unchecked.
   *
   * @param string $data
   *   A chunk of data.
   *
   * @return string
   *   The encoded data.
   *
   * @throws ParserCSVEncodingException
   *   Thrown when a given encoding does not match.
   */
  public function fixEncoding($data) {
    if ($this->useMbString) {
      if (mb_check_encoding($data, $this->fromEncoding)) {
        if ($this->toEncoding != $this->fromEncoding) {
          // Convert encoding. The conversion is to UTF-8 by default to prevent
          // SQL errors.
          $data = mb_convert_encoding($data, $this->toEncoding, $this->fromEncoding);
        }
      }
      else {
        throw new ParserCSVEncodingException(t('Source file is not in %encoding encoding.', array('%encoding' => $this->fromEncoding)));
      }
    }
    return $data;
  }

}
/**
 * Exception type raised for encoding errors encountered while parsing.
 *
 * It adds nothing to the base Exception class; it exists so that callers can
 * catch encoding problems separately from other failures.
 */
class ParserCSVEncodingException extends Exception {

}
|