ParserCSV.inc 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. <?php
  2. /**
  3. * @file
  4. * Contains CSV Parser.
  5. *
  6. * Functions in this file are independent of the Feeds specific implementation.
  7. * Thanks to jpetso http://drupal.org/user/56020 for most of the code in this
  8. * file.
  9. */
  10. /**
  11. * Text lines from file iterator.
  12. */
  13. class ParserCSVIterator implements Iterator {
  14. private $handle;
  15. private $currentLine;
  16. private $currentPos;
  17. public function __construct($filepath) {
  18. $this->handle = fopen($filepath, 'r');
  19. $this->currentLine = NULL;
  20. $this->currentPos = NULL;
  21. }
  22. function __destruct() {
  23. if ($this->handle) {
  24. fclose($this->handle);
  25. }
  26. }
  27. public function rewind($pos = 0) {
  28. if ($this->handle) {
  29. fseek($this->handle, $pos);
  30. $this->next();
  31. }
  32. }
  33. public function next() {
  34. if ($this->handle) {
  35. $this->currentLine = feof($this->handle) ? NULL : fgets($this->handle);
  36. $this->currentPos = ftell($this->handle);
  37. return $this->currentLine;
  38. }
  39. }
  40. public function valid() {
  41. return isset($this->currentLine);
  42. }
  43. public function current() {
  44. return $this->currentLine;
  45. }
  46. public function currentPos() {
  47. return $this->currentPos;
  48. }
  49. public function key() {
  50. return 'line';
  51. }
  52. }
  53. /**
  54. * Functionality to parse CSV files into a two dimensional array.
  55. */
  56. class ParserCSV {
  57. private $delimiter;
  58. private $fromEncoding;
  59. private $toEncoding;
  60. private $skipFirstLine;
  61. private $columnNames;
  62. private $timeout;
  63. private $timeoutReached;
  64. private $startByte;
  65. private $lineLimit;
  66. private $lastLinePos;
  67. private $useMbString;
  68. public function __construct() {
  69. $this->delimiter = ',';
  70. $this->fromEncoding = 'UTF-8';
  71. $this->toEncoding = 'UTF-8';
  72. $this->skipFirstLine = FALSE;
  73. $this->columnNames = FALSE;
  74. $this->timeout = FALSE;
  75. $this->timeoutReached = FALSE;
  76. $this->startByte = 0;
  77. $this->lineLimit = 0;
  78. $this->lastLinePos = 0;
  79. ini_set('auto_detect_line_endings', TRUE);
  80. if (extension_loaded('mbstring') && variable_get('feeds_use_mbstring', TRUE)) {
  81. $this->useMbString = TRUE;
  82. }
  83. }
  84. /**
  85. * Set the column delimiter string.
  86. * By default, the comma (',') is used as delimiter.
  87. */
  88. public function setDelimiter($delimiter) {
  89. $this->delimiter = $delimiter;
  90. }
  91. /**
  92. * Sets the source file encoding.
  93. *
  94. * By default, the encoding is UTF-8.
  95. *
  96. * @param string $encoding
  97. * The encoding to set.
  98. */
  99. public function setEncoding($encoding) {
  100. $this->fromEncoding = $encoding;
  101. }
  102. /**
  103. * Set this to TRUE if the parser should skip the first line of the CSV text,
  104. * which might be desired if the first line contains the column names.
  105. * By default, this is set to FALSE and the first line is not skipped.
  106. */
  107. public function setSkipFirstLine($skipFirstLine) {
  108. $this->skipFirstLine = $skipFirstLine;
  109. }
  110. /**
  111. * Specify an array of column names if you know them in advance, or FALSE
  112. * (which is the default) to unset any prior column names. If no column names
  113. * are set, the parser will put each row into a simple numerically indexed
  114. * array. If column names are given, the parser will create arrays with
  115. * these column names as array keys instead.
  116. */
  117. public function setColumnNames($columnNames) {
  118. $this->columnNames = $columnNames;
  119. }
  120. /**
  121. * Define the time (in milliseconds) after which the parser stops parsing,
  122. * even if it has not yet finished processing the CSV data. If the timeout
  123. * has been reached before parsing is done, the parse() method will return
  124. * an incomplete list of rows - a single row will never be cut off in the
  125. * middle, though. By default, no timeout (@p $timeout == FALSE) is defined.
  126. *
  127. * You can check if the timeout has been reached by calling the
  128. * timeoutReached() method after parse() has been called.
  129. */
  130. public function setTimeout($timeout) {
  131. $this->timeout = $timeout;
  132. }
  133. /**
  134. * After calling the parse() method, determine if the timeout (set by the
  135. * setTimeout() method) has been reached.
  136. *
  137. * @deprecated Use lastLinePos() instead to determine whether a file has
  138. * finished parsing.
  139. */
  140. public function timeoutReached() {
  141. return $this->timeoutReached;
  142. }
  143. /**
  144. * Define the number of lines to parse in one parsing operation.
  145. *
  146. * By default, all lines of a file are being parsed.
  147. */
  148. public function setLineLimit($lines) {
  149. $this->lineLimit = $lines;
  150. }
  151. /**
  152. * Get the byte number where the parser left off after last parse() call.
  153. *
  154. * @return
  155. * 0 if all lines or no line has been parsed, the byte position of where a
  156. * timeout or the line limit has been reached otherwise. This position can be
  157. * used to set the start byte for the next iteration after parse() has
  158. * reached the timeout set with setTimeout() or the line limit set with
  159. * setLineLimit().
  160. *
  161. * @see ParserCSV::setStartByte()
  162. */
  163. public function lastLinePos() {
  164. return $this->lastLinePos;
  165. }
  166. /**
  167. * Set the byte where file should be started to read.
  168. *
  169. * Useful when parsing a file in batches.
  170. */
  171. public function setStartByte($start) {
  172. return $this->startByte = $start;
  173. }
  174. /**
  175. * Parse CSV files into a two dimensional array.
  176. *
  177. * @param Iterator $lineIterator
  178. * An Iterator object that yields line strings, e.g. ParserCSVIterator.
  179. * @param $start
  180. * The byte number from where to start parsing the file.
  181. * @param $lines
  182. * The number of lines to parse, 0 for all lines.
  183. * @return
  184. * Two dimensional array that contains the data in the CSV file.
  185. */
  186. public function parse(Iterator $lineIterator) {
  187. $skipLine = $this->skipFirstLine;
  188. $rows = array();
  189. $this->timeoutReached = FALSE;
  190. $this->lastLinePos = 0;
  191. $maxTime = empty($this->timeout) ? FALSE : (microtime() + $this->timeout);
  192. $linesParsed = 0;
  193. for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {
  194. // Make really sure we've got lines without trailing newlines.
  195. $line = trim($this->fixEncoding($lineIterator->current()), "\r\n");
  196. // Skip empty lines.
  197. if (empty($line)) {
  198. continue;
  199. }
  200. // If the first line contains column names, skip it.
  201. if ($skipLine) {
  202. $skipLine = FALSE;
  203. continue;
  204. }
  205. // The actual parser. explode() is unfortunately not suitable because the
  206. // delimiter might be located inside a quoted field, and that would break
  207. // the field and/or require additional effort to re-join the fields.
  208. $quoted = FALSE;
  209. $currentIndex = 0;
  210. $currentField = '';
  211. $fields = array();
  212. // We must use strlen() as we're parsing byte by byte using strpos(), so
  213. // drupal_strlen() will not work properly.
  214. while ($currentIndex <= strlen($line)) {
  215. if ($quoted) {
  216. $nextQuoteIndex = strpos($line, '"', $currentIndex);
  217. if ($nextQuoteIndex === FALSE) {
  218. // There's a line break before the quote is closed, so fetch the
  219. // next line and start from there.
  220. $currentField .= substr($line, $currentIndex);
  221. $lineIterator->next();
  222. if (!$lineIterator->valid()) {
  223. // Whoa, an unclosed quote! Well whatever, let's just ignore
  224. // that shortcoming and record it nevertheless.
  225. $fields[] = $currentField;
  226. break;
  227. }
  228. // Ok, so, on with fetching the next line, as mentioned above.
  229. $currentField .= "\n";
  230. $line = trim($this->fixEncoding($lineIterator->current()), "\r\n");
  231. $currentIndex = 0;
  232. continue;
  233. }
  234. // There's actually another quote in this line...
  235. // find out whether it's escaped or not.
  236. $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);
  237. if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {
  238. // Escaped quote, add a single one to the field and proceed quoted.
  239. $currentField .= '"';
  240. $currentIndex = $nextQuoteIndex + 2;
  241. }
  242. else {
  243. // End of the quoted section, close the quote and let the
  244. // $quoted == FALSE block finalize the field.
  245. $quoted = FALSE;
  246. $currentIndex = $nextQuoteIndex + 1;
  247. }
  248. }
  249. else { // $quoted == FALSE
  250. // First, let's find out where the next character of interest is.
  251. $nextQuoteIndex = strpos($line, '"', $currentIndex);
  252. $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex);
  253. if ($nextQuoteIndex === FALSE) {
  254. $nextIndex = $nextDelimiterIndex;
  255. }
  256. elseif ($nextDelimiterIndex === FALSE) {
  257. $nextIndex = $nextQuoteIndex;
  258. }
  259. else {
  260. $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
  261. }
  262. if ($nextIndex === FALSE) {
  263. // This line is done, add the rest of it as last field.
  264. $currentField .= substr($line, $currentIndex);
  265. $fields[] = $currentField;
  266. break;
  267. }
  268. elseif ($line[$nextIndex] === $this->delimiter[0]) {
  269. $length = ($nextIndex + strlen($this->delimiter) - 1) - $currentIndex;
  270. $currentField .= substr($line, $currentIndex, $length);
  271. $fields[] = $currentField;
  272. $currentField = '';
  273. $currentIndex += $length + 1;
  274. // Continue with the next field.
  275. }
  276. else { // $line[$nextIndex] == '"'
  277. $quoted = TRUE;
  278. $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
  279. $currentIndex = $nextIndex + 1;
  280. // Continue this field in the $quoted == TRUE block.
  281. }
  282. }
  283. }
  284. // End of CSV parser. We've now got all the fields of the line as strings
  285. // in the $fields array.
  286. if (empty($this->columnNames)) {
  287. $row = $fields;
  288. }
  289. else {
  290. $row = array();
  291. foreach ($this->columnNames as $columnName) {
  292. $field = array_shift($fields);
  293. $row[$columnName] = isset($field) ? $field : '';
  294. }
  295. }
  296. $rows[] = $row;
  297. // Quit parsing if timeout has been reached or requested lines have been
  298. // reached.
  299. if (!empty($maxTime) && microtime() > $maxTime) {
  300. $this->timeoutReached = TRUE;
  301. $this->lastLinePos = $lineIterator->currentPos();
  302. break;
  303. }
  304. $linesParsed++;
  305. if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
  306. $this->lastLinePos = $lineIterator->currentPos();
  307. break;
  308. }
  309. }
  310. return $rows;
  311. }
  312. /**
  313. * Converts encoding of input data.
  314. *
  315. * @param string $data
  316. * A chunk of data.
  317. *
  318. * @return string
  319. * The encoded data.
  320. *
  321. * @throws ParserCSVEncodingException
  322. * Thrown when a given encoding does not match.
  323. */
  324. public function fixEncoding($data) {
  325. if ($this->useMbString) {
  326. if (mb_check_encoding($data, $this->fromEncoding)) {
  327. if ($this->toEncoding != $this->fromEncoding) {
  328. // Convert encoding. The conversion is to UTF-8 by default to prevent
  329. // SQL errors.
  330. $data = mb_convert_encoding($data, $this->toEncoding, $this->fromEncoding);
  331. }
  332. }
  333. else {
  334. throw new ParserCSVEncodingException(t('Source file is not in %encoding encoding.', array('%encoding' => $this->fromEncoding)));
  335. }
  336. }
  337. return $data;
  338. }
  339. }
  340. /**
  341. * Exception thrown when an encoding error occurs during parsing.
  342. */
  343. class ParserCSVEncodingException extends Exception {}