tcpdf_parser.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
  1. <?php
  2. //============================================================+
  3. // File name : tcpdf_parser.php
  4. // Version : 1.0.000
  5. // Begin : 2011-05-23
  6. // Last Update : 2011-07-14
  7. // Author : Nicola Asuni - Tecnick.com S.r.l - Via Della Pace, 11 - 09044 - Quartucciu (CA) - ITALY - www.tecnick.com - info@tecnick.com
  8. // License : http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT GNU-LGPLv3 + YOU CAN'T REMOVE ANY TCPDF COPYRIGHT NOTICE OR LINK FROM THE GENERATED PDF DOCUMENTS.
  9. // -------------------------------------------------------------------
  10. // Copyright (C) 2011-2011 Nicola Asuni - Tecnick.com S.r.l.
  11. //
  12. // This file is part of TCPDF software library.
  13. //
  14. // TCPDF is free software: you can redistribute it and/or modify it
  15. // under the terms of the GNU Lesser General Public License as
  16. // published by the Free Software Foundation, either version 3 of the
  17. // License, or (at your option) any later version. Additionally,
  18. // YOU CAN'T REMOVE ANY TCPDF COPYRIGHT NOTICE OR LINK FROM THE
  19. // GENERATED PDF DOCUMENTS.
  20. //
  21. // TCPDF is distributed in the hope that it will be useful, but
  22. // WITHOUT ANY WARRANTY; without even the implied warranty of
  23. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  24. // See the GNU Lesser General Public License for more details.
  25. //
  26. // You should have received a copy of the License
  27. // along with TCPDF. If not, see
  28. // <http://www.tecnick.com/pagefiles/tcpdf/LICENSE.TXT>.
  29. //
  30. // See LICENSE.TXT file for more information.
  31. // -------------------------------------------------------------------
  32. //
  33. // Description : This is a PHP class for parsing PDF documents.
  34. //
  35. //============================================================+
  36. /**
  37. * @file
  38. * This is a PHP class for parsing PDF documents.<br>
  39. * @package com.tecnick.tcpdf
  40. * @author Nicola Asuni
  41. * @version 1.0.000
  42. */
  43. // include class for decoding filters
  44. require_once(dirname(__FILE__).'/tcpdf_filters.php');
  45. /**
  46. * @class TCPDF_PARSER
  47. * This is a PHP class for parsing PDF documents.<br>
  48. * @package com.tecnick.tcpdf
  49. * @brief This is a PHP class for parsing PDF documents.
  50. * @version 1.0.000
  51. * @author Nicola Asuni - info@tecnick.com
  52. */
  class TCPDF_PARSER {

      /**
       * Raw content of the PDF document.
       * Only populated while the constructor runs; reset to an empty string afterwards.
       * @private
       */
      private $pdfdata = '';

      /**
       * XREF data: array with the keys 'xref' (object index => byte offset) and 'trailer'.
       * @protected
       */
      protected $xref = array();

      /**
       * Array of parsed PDF objects, indexed by "[object number]_[generation number]".
       * @protected
       */
      protected $objects = array();

      /**
       * Class object for decoding filters.
       * @private
       */
      private $FilterDecoders;

      /**
       * Offsets of the xref sections already processed by getXrefData().
       * Used to stop following circular /Prev chains.
       * @private
       */
      private $xrefOffsets = array();

      // -----------------------------------------------------------------------------

      /**
       * Parse a PDF document and store the resulting xref data and objects.
       * Use getParsedData() to retrieve the result.
       * @param $data (string) PDF data to parse.
       * @public
       * @since 1.0.000 (2011-05-24)
       */
      public function __construct($data) {
          if (empty($data)) {
              $this->Error('Empty PDF data.');
          }
          $this->pdfdata = $data;
          // initialize class for decoding filters
          $this->FilterDecoders = new TCPDF_FILTERS();
          // get xref and trailer data
          $this->xref = $this->getXrefData();
          // parse all document objects listed in the cross-reference table
          $this->objects = array();
          if (isset($this->xref['xref'])) {
              foreach ($this->xref['xref'] as $obj => $offset) {
                  // an object may already have been parsed while resolving a reference
                  if (!isset($this->objects[$obj])) {
                      $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true);
                  }
              }
          }
          // release some memory
          $this->pdfdata = '';
      }

      /**
       * Return an array of parsed PDF document objects.
       * @return (array) Two-element array: xref data at index 0, parsed objects at index 1.
       * @public
       * @since 1.0.000 (2011-06-26)
       */
      public function getParsedData() {
          return array($this->xref, $this->objects);
      }

      /**
       * Get xref (cross-reference table) and trailer data from PDF document data.
       * When $offset is 0 the last "startxref" entry of the document is used to
       * locate the table; otherwise $offset is taken as the position of the
       * "xref" keyword (this is how /Prev sections are followed).
       * Entries found first win, so the most recent document revision takes
       * precedence over older ones.
       * @param $offset (int) xref offset (if known).
       * @param $xref (array) previous xref array (if any).
       * @return Array containing xref and trailer data.
       * @protected
       * @since 1.0.000 (2011-05-24)
       */
      protected function getXrefData($offset=0, $xref=array()) {
          $offset = intval($offset);
          if ($offset <= 0) {
              // top-level call: start a fresh chain and find the last startxref
              $this->xrefOffsets = array();
              if (preg_match_all('/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_SET_ORDER) == 0) {
                  $this->Error('Unable to find startxref');
                  return $xref;
              }
              $matches = array_pop($matches);
              $startxref = intval($matches[1]);
          } else {
              $startxref = $offset;
          }
          // a section that was already processed means a circular /Prev chain
          if (isset($this->xrefOffsets[$startxref])) {
              return $xref;
          }
          $this->xrefOffsets[$startxref] = true;
          // check xref position (substr is safe for out-of-range offsets)
          if (substr($this->pdfdata, $startxref, 4) !== 'xref') {
              $this->Error('Unable to find xref');
              return $xref;
          }
          // extract xref data (object indexes and offsets)
          $offset = $startxref + 4;
          // initialize object number
          $obj_num = 0;
          while (true) {
              // skip the end-of-line marker / padding that precedes the next line
              $offset += strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset);
              // \G anchors the match at $offset so the loop cannot run past the
              // table into unrelated data (e.g. "5 0 obj" of a later revision)
              if (preg_match('/\G([0-9]+)[\s]([0-9]+)[\s]?([nf]?)/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) != 1) {
                  break;
              }
              $offset = ($matches[0][1] + strlen($matches[0][0]));
              $entry_type = strtolower($matches[3][0]);
              if ($entry_type == 'n') {
                  // create unique object index: [object number]_[generation number]
                  $index = $obj_num.'_'.intval($matches[2][0]);
                  // keep the entry of the most recent revision if it already exists
                  if (!isset($xref['xref'][$index])) {
                      // store object offset position
                      $xref['xref'][$index] = intval($matches[1][0]);
                  }
                  ++$obj_num;
              } elseif ($entry_type == 'f') {
                  // free entry: only advances the object counter
                  ++$obj_num;
              } else {
                  // subsection header: first number is the starting object number
                  $obj_num = intval($matches[1][0]);
              }
          }
          // get trailer data
          if (preg_match('/trailer[\s]*<<(.*)>>[\s]*[\r\n]+startxref[\s]*[\r\n]+/isU', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
              $trailer_data = $matches[1][0];
              if (!isset($xref['trailer'])) {
                  // get only the last updated version
                  $xref['trailer'] = array();
                  // parse trailer_data
                  if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['size'] = intval($matches[1]);
                  }
                  if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['root'] = intval($matches[1]).'_'.intval($matches[2]);
                  }
                  if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['encrypt'] = intval($matches[1]).'_'.intval($matches[2]);
                  }
                  if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['info'] = intval($matches[1]).'_'.intval($matches[2]);
                  }
                  if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) {
                      $xref['trailer']['id'] = array();
                      $xref['trailer']['id'][0] = $matches[1];
                      $xref['trailer']['id'][1] = $matches[2];
                  }
              }
              if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                  $prev = intval($matches[1]);
                  // offset 0 would restart the search from the last startxref
                  if ($prev > 0) {
                      // get previous xref
                      $xref = $this->getXrefData($prev, $xref);
                  }
              }
          } else {
              $this->Error('Unable to find trailer');
          }
          return $xref;
      }

      /**
       * Get object type, raw value and offset to next object.
       * Returned types are: '/' (name), '(' and ')' (literal string delimiters),
       * '[' and ']' (array delimiters), '<<' and '>>' (dictionary delimiters),
       * '<' and '>' (hexadecimal string delimiters), 'null', 'boolean', 'numeric',
       * 'stream', 'endstream', 'objref' (indirect reference), 'obj' (object start),
       * 'endobj', or an empty string when no token can be read. In the latter
       * case the returned offset points at the unreadable position, so callers
       * must stop when the offset does not advance.
       * @param $offset (int) Object offset.
       * @return array containing object type, raw value and offset to next object
       * @protected
       * @since 1.0.000 (2011-06-20)
       */
      protected function getRawObject($offset=0) {
          $objtype = ''; // object type to be returned
          $objval = ''; // object value to be returned
          $datalen = strlen($this->pdfdata);
          if ($offset < 0) {
              return array($objtype, $objval, $offset);
          }
          // skip white space chars (NUL, HT, LF, FF, CR, SP) and comments
          while ($offset < $datalen) {
              $offset += strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset);
              if (($offset >= $datalen) || ($this->pdfdata[$offset] != '%')) {
                  break;
              }
              // \x25 PERCENT SIGN: a comment extends to the end of the line
              $offset += strcspn($this->pdfdata, "\r\n", $offset);
          }
          if ($offset >= $datalen) {
              // end of data: nothing to read
              return array($objtype, $objval, $offset);
          }
          // get first char
          $char = $this->pdfdata[$offset];
          // get object type
          switch ($char) {
              case '/': { // \x2F SOLIDUS
                  // name object
                  $objtype = $char;
                  ++$offset;
                  if (preg_match('/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/', substr($this->pdfdata, $offset, 256), $matches) == 1) {
                      $objval = $matches[1]; // unescaped value
                      $offset += strlen($objval);
                  }
                  break;
              }
              case '(': // \x28 LEFT PARENTHESIS
              case ')': { // \x29 RIGHT PARENTHESIS
                  // literal string object
                  $objtype = $char;
                  ++$offset;
                  if ($char == '(') {
                      $strpos = $offset;
                      $open_bracket = 1;
                      // scan up to the matching (unescaped) closing parenthesis
                      while (($open_bracket > 0) && isset($this->pdfdata[$strpos])) {
                          $ch = $this->pdfdata[$strpos];
                          if ($ch == '\\') {
                              // REVERSE SOLIDUS: skip the escaped character
                              ++$strpos;
                          } elseif ($ch == '(') {
                              ++$open_bracket;
                          } elseif ($ch == ')') {
                              --$open_bracket;
                          }
                          ++$strpos;
                      }
                      // raw (still escaped) content without the closing parenthesis
                      $objval = (string) substr($this->pdfdata, $offset, max(0, ($strpos - $offset - 1)));
                      $offset = min($strpos, $datalen);
                  }
                  break;
              }
              case '[': // \x5B LEFT SQUARE BRACKET
              case ']': { // \x5D RIGHT SQUARE BRACKET
                  // array object
                  $objtype = $char;
                  ++$offset;
                  if ($char == '[') {
                      // get array content
                      $objval = array();
                      do {
                          $oldoffset = $offset;
                          // get element
                          $element = $this->getRawObject($offset);
                          $offset = $element[2];
                          $objval[] = $element;
                          // stop on the closing bracket or when no progress is made (truncated data)
                      } while (($element[0] != ']') && ($offset != $oldoffset));
                      // remove closing delimiter
                      array_pop($objval);
                  }
                  break;
              }
              case '<': // \x3C LESS-THAN SIGN
              case '>': { // \x3E GREATER-THAN SIGN
                  if (isset($this->pdfdata[($offset + 1)]) && ($this->pdfdata[($offset + 1)] == $char)) {
                      // dictionary object
                      $objtype = $char.$char;
                      $offset += 2;
                      if ($char == '<') {
                          // get dictionary content as a flat list of key and value elements
                          $objval = array();
                          do {
                              $oldoffset = $offset;
                              // get element
                              $element = $this->getRawObject($offset);
                              $offset = $element[2];
                              $objval[] = $element;
                              // stop on the closing delimiter or when no progress is made (truncated data)
                          } while (($element[0] != '>>') && ($offset != $oldoffset));
                          // remove closing delimiter
                          array_pop($objval);
                      }
                  } else {
                      // hexadecimal string object
                      $objtype = $char;
                      ++$offset;
                      // white space inside a hexadecimal string is ignored; the string may be empty
                      if (($char == '<') && (preg_match('/\G([0-9A-Fa-f\x00\x09\x0a\x0c\x0d\x20]*)>/', $this->pdfdata, $matches, 0, $offset) == 1)) {
                          $objval = preg_replace('/[^0-9A-Fa-f]/', '', $matches[1]);
                          $offset += strlen($matches[0]);
                      }
                  }
                  break;
              }
              default: {
                  if (substr($this->pdfdata, $offset, 6) == 'endobj') {
                      // indirect object
                      $objtype = 'endobj';
                      $offset += 6;
                  } elseif (substr($this->pdfdata, $offset, 4) == 'null') {
                      // null object
                      $objtype = 'null';
                      $offset += 4;
                      $objval = 'null';
                  } elseif (substr($this->pdfdata, $offset, 4) == 'true') {
                      // boolean true object
                      $objtype = 'boolean';
                      $offset += 4;
                      $objval = 'true';
                  } elseif (substr($this->pdfdata, $offset, 5) == 'false') {
                      // boolean false object
                      $objtype = 'boolean';
                      $offset += 5;
                      $objval = 'false';
                  } elseif (substr($this->pdfdata, $offset, 6) == 'stream') {
                      // start stream object
                      $objtype = 'stream';
                      $offset += 6;
                      // the keyword is followed by one end-of-line marker (CRLF is consumed
                      // as a unit so that its LF does not end up in the stream data)
                      if (preg_match('/\G(?:\r\n|\n|\r)(.*)[\r\n]*endstream/isU', $this->pdfdata, $matches, 0, $offset) == 1) {
                          $objval = $matches[1];
                          $offset += strlen($matches[0]);
                      }
                  } elseif (substr($this->pdfdata, $offset, 9) == 'endstream') {
                      // end stream object
                      $objtype = 'endstream';
                      $offset += 9;
                  } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) {
                      // indirect object reference
                      $objtype = 'objref';
                      $offset += strlen($matches[0]);
                      $objval = intval($matches[1]).'_'.intval($matches[2]);
                  } elseif (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($this->pdfdata, $offset, 33), $matches) == 1) {
                      // object start
                      $objtype = 'obj';
                      $objval = intval($matches[1]).'_'.intval($matches[2]);
                      $offset += strlen($matches[0]);
                  } elseif (($numlen = strspn($this->pdfdata, '+-.0123456789', $offset)) > 0) {
                      // numeric object
                      $objtype = 'numeric';
                      $objval = substr($this->pdfdata, $offset, $numlen);
                      $offset += $numlen;
                  }
                  // anything else is left as an empty type with an unchanged offset
                  break;
              }
          }
          return array($objtype, $objval, $offset);
      }

      /**
       * Get content of indirect object.
       * @param $obj_ref (string) Object number and generation number separated by underscore character.
       * @param $offset (int) Object offset.
       * @param $decoding (boolean) If true decode streams.
       * @return array containing the raw elements of the object; when the object
       * header is not found at $offset a single 'null' element is returned instead.
       * A decoded stream element carries the result of decodeStream() at index 3.
       * @protected
       * @since 1.0.000 (2011-05-24)
       */
      protected function getIndirectObject($obj_ref, $offset=0, $decoding=true) {
          $obj = explode('_', $obj_ref);
          if (count($obj) != 2) {
              $this->Error('Invalid object reference: '.$obj_ref);
              return;
          }
          $objref = $obj[0].' '.$obj[1].' obj';
          // compare in place: no full-document scan and safe for out-of-range offsets
          if (($offset < 0) || (substr($this->pdfdata, $offset, strlen($objref)) !== $objref)) {
              // an indirect reference to an undefined object shall be considered a reference to the null object
              return array('null', 'null', $offset);
          }
          // starting position of object content
          $offset += strlen($objref);
          // get array of object content
          $objdata = array();
          $i = 0; // object main index
          do {
              $oldoffset = $offset;
              // get element
              $element = $this->getRawObject($offset);
              $offset = $element[2];
              // decode stream using stream's dictionary information
              if ($decoding && ($element[0] == 'stream') && (isset($objdata[($i - 1)][0])) && ($objdata[($i - 1)][0] == '<<')) {
                  $element[3] = $this->decodeStream($objdata[($i - 1)][1], $element[1]);
              }
              $objdata[$i] = $element;
              ++$i;
              // stop at the end of the object or when no progress is made (truncated/unknown data)
          } while (($element[0] != 'endobj') && ($offset != $oldoffset));
          // remove closing delimiter
          array_pop($objdata);
          // return raw object content
          return $objdata;
      }

      /**
       * Get the content of object, resolving indirect object reference if necessary.
       * Resolved objects are cached in the objects array (streams are not decoded
       * for objects that are first parsed here).
       * @param $obj (array) Object element as returned by getRawObject().
       * @return array containing object data; the input element is returned
       * unchanged when it is not a resolvable reference.
       * @protected
       * @since 1.0.000 (2011-06-26)
       */
      protected function getObjectVal($obj) {
          if ($obj[0] == 'objref') {
              // reference to indirect object
              if (isset($this->objects[$obj[1]])) {
                  // this object has been already parsed
                  return $this->objects[$obj[1]];
              } elseif (isset($this->xref['xref'][$obj[1]])) {
                  // parse new object (offsets are stored under the 'xref' key)
                  $this->objects[$obj[1]] = $this->getIndirectObject($obj[1], $this->xref['xref'][$obj[1]], false);
                  return $this->objects[$obj[1]];
              }
          }
          return $obj;
      }

      /**
       * Decode the specified stream.
       * The stream is first truncated to the declared /Length (when this is a
       * direct numeric value smaller than the raw data) and then passed through
       * the declared filters, in order, for which a decoder is available.
       * @param $sdic (array) Stream's dictionary array.
       * @param $stream (string) Stream to decode.
       * @return array containing decoded stream data and remaining filters.
       * @protected
       * @since 1.0.000 (2011-06-22)
       */
      protected function decodeStream($sdic, $stream) {
          // get stream length and filters
          $slength = strlen($stream);
          $filters = array();
          foreach ($sdic as $k => $v) {
              // only name elements can be dictionary keys
              if ($v[0] != '/') {
                  continue;
              }
              if (!isset($sdic[($k + 1)])) {
                  continue;
              }
              $next = $sdic[($k + 1)];
              if ($v[1] == 'Length') {
                  if ($next[0] == 'numeric') {
                      // get declared stream length
                      $declength = intval($next[1]);
                      if ($declength < $slength) {
                          $stream = substr($stream, 0, $declength);
                          $slength = $declength;
                      }
                  }
              } elseif ($v[1] == 'Filter') {
                  // resolve indirect object
                  $objval = $this->getObjectVal($next);
                  if ($objval[0] == '/') {
                      // single filter
                      $filters[] = $objval[1];
                  } elseif ($objval[0] == '[') {
                      // array of filters
                      foreach ($objval[1] as $flt) {
                          if ($flt[0] == '/') {
                              $filters[] = $flt[1];
                          }
                      }
                  }
              }
          }
          // decode the stream
          $remaining_filters = array();
          foreach ($filters as $filter) {
              if (in_array($filter, $this->FilterDecoders->getAvailableFilters())) {
                  $stream = $this->FilterDecoders->decodeFilter($filter, $stream);
              } else {
                  // add missing filter to array
                  $remaining_filters[] = $filter;
              }
          }
          return array($stream, $remaining_filters);
      }

      /**
       * This method is automatically called in case of fatal error; it simply outputs the message and halts the execution.
       * @param $msg (string) The error message
       * @public
       * @since 1.0.000 (2011-05-23)
       */
      public function Error($msg) {
          // exit program and print error
          die('<strong>TCPDF_PARSER ERROR: </strong>'.$msg);
      }

  } // END OF TCPDF_PARSER CLASS
  483. //============================================================+
  484. // END OF FILE
  485. //============================================================+