pdf_parser.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690
  1. <?php
  2. //
  3. // FPDI - Version 1.2
  4. //
  5. // Copyright 2004-2007 Setasign - Jan Slabon
  6. //
  7. // Licensed under the Apache License, Version 2.0 (the "License");
  8. // you may not use this file except in compliance with the License.
  9. // You may obtain a copy of the License at
  10. //
  11. // http://www.apache.org/licenses/LICENSE-2.0
  12. //
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS,
  15. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. // See the License for the specific language governing permissions and
  17. // limitations under the License.
  18. //
  // Type tags used by pdf_parser: every parsed value is returned as an array
  // whose element 0 is one of these identifiers. Each constant is only
  // defined when no constant of that name exists yet, so the file can be
  // loaded next to other code that already declares them.
  defined('PDF_TYPE_NULL')       || define('PDF_TYPE_NULL', 0);
  defined('PDF_TYPE_NUMERIC')    || define('PDF_TYPE_NUMERIC', 1);
  defined('PDF_TYPE_TOKEN')      || define('PDF_TYPE_TOKEN', 2);
  defined('PDF_TYPE_HEX')        || define('PDF_TYPE_HEX', 3);
  defined('PDF_TYPE_STRING')     || define('PDF_TYPE_STRING', 4);
  defined('PDF_TYPE_DICTIONARY') || define('PDF_TYPE_DICTIONARY', 5);
  defined('PDF_TYPE_ARRAY')      || define('PDF_TYPE_ARRAY', 6);
  defined('PDF_TYPE_OBJDEC')     || define('PDF_TYPE_OBJDEC', 7);
  defined('PDF_TYPE_OBJREF')     || define('PDF_TYPE_OBJREF', 8);
  defined('PDF_TYPE_OBJECT')     || define('PDF_TYPE_OBJECT', 9);
  defined('PDF_TYPE_STREAM')     || define('PDF_TYPE_STREAM', 10);
  /**
   * Low-level reader for the object structure of an existing PDF file.
   *
   * Constructing an instance opens the file, reads the cross-reference data
   * and trailer, rejects encrypted files and resolves the /Root object.
   * Failures are not thrown: they are reported through $success / $errormsg.
   */
  class pdf_parser {

      /**
       * Filename
       * @var string
       */
      var $filename;

      /**
       * File resource (false when the file could not be opened)
       * @var resource
       */
      var $f;

      /**
       * PDF Context
       * @var object pdf_context-Instance
       */
      var $c;

      /**
       * xref-Data: 'xref' (object number => generation => offset),
       * 'trailer', 'xref_location' and 'max_object' as filled by pdf_read_xref()
       * @var array
       */
      var $xref;

      /**
       * root-Object
       * @var array
       */
      var $root;

      /**
       * mPDF 4.0 Added flag to show success on loading file
       * @var boolean
       */
      var $success;

      /**
       * Message describing the last failure (only meaningful when $success is false)
       * @var string
       */
      var $errormsg;

      /**
       * Reference to the object array currently being built by
       * pdf_resolve_object(); pdf_read_value() reads the /Length of a stream
       * from it. Declared explicitly because creating it on the fly is a
       * deprecated "dynamic property" on PHP 8.2+.
       * @var array
       */
      var $actual_obj;

      /**
       * Constructor
       *
       * PHP 8 no longer treats a method named like the class as constructor,
       * so the set-up lives in __construct(); pdf_parser() stays available as
       * an alias for existing callers.
       *
       * @param string $filename Source-Filename
       * @return mixed false on failure when called explicitly (ignored by "new");
       *               check $success / $errormsg instead
       */
      function __construct($filename) {
          $this->filename = $filename;
          // mPDF 4.0
          $this->success = true;

          $this->f = @fopen($this->filename, "rb");
          if (!$this->f) {
              $this->success = false;
              $this->errormsg = sprintf("Cannot open %s !", $filename);
              return false;
          }

          // mPDF 5.0 Removed pass by reference =&
          $this->c = new pdf_context($this->f);

          // Read xref-Data
          $offset = $this->pdf_find_xref();
          if ($offset === false) {
              $this->success = false;
              // The file did open: keep the specific reason recorded by
              // pdf_find_xref() and only fall back to the generic message.
              if (empty($this->errormsg)) {
                  $this->errormsg = sprintf("Cannot open %s !", $filename);
              }
              return false;
          }
          $this->pdf_read_xref($this->xref, $offset);
          if ($this->success == false) { return false; }

          // Check for Encryption
          $this->getEncryption();
          if ($this->success == false) { return false; }

          // Read root
          $this->pdf_read_root();
          if ($this->success == false) { return false; }
      }

      /**
       * PHP 4 style constructor, kept for backward compatibility
       * (e.g. subclasses calling parent::pdf_parser($filename)).
       *
       * The class-qualified call always runs this class' own set-up, even
       * when a subclass defines its own __construct().
       *
       * @param string $filename Source-Filename
       * @return mixed see __construct()
       */
      function pdf_parser($filename) {
          return pdf_parser::__construct($filename);
      }

      /**
       * Close the opened file
       *
       * Safe to call repeatedly and after a failed open: $f holds false in
       * that case, which must not be handed to fclose().
       */
      function closeFile() {
          if (isset($this->f)) {
              if (is_resource($this->f)) {
                  fclose($this->f);
              }
              unset($this->f);
          }
      }

      /**
       * Print Error and die
       *
       * @param string $msg Error-Message
       */
      function error($msg) {
          die("<b>PDF-Parser Error:</b> ".$msg);
      }

      /**
       * Check Trailer for Encryption
       *
       * Flags the parser as failed when the trailer dictionary has an
       * /Encrypt entry.
       *
       * @return mixed false when the file is encrypted, nothing otherwise
       */
      function getEncryption() {
          if (isset($this->xref['trailer'][1]['/Encrypt'])) {
              // mPDF 4.0
              $this->success = false;
              $this->errormsg = sprintf("File is encrypted!");
              return false;
          }
      }

      /**
       * Find/Return /Root
       *
       * @return mixed the /Root entry of the trailer (an indirect reference),
       *               or false when it is missing or of another type
       */
      function pdf_find_root() {
          // isset() first: a trailer without /Root must end in the error
          // below, not in "undefined index" warnings.
          if (!isset($this->xref['trailer'][1]['/Root'][0])
              || $this->xref['trailer'][1]['/Root'][0] != PDF_TYPE_OBJREF) {
              // mPDF 4.0
              $this->success = false;
              $this->errormsg = sprintf("Wrong Type of Root-Element! Must be an indirect reference");
              return false;
          }
          return $this->xref['trailer'][1]['/Root'];
      }

      /**
       * Read the /Root
       *
       * Resolves the /Root reference and stores the object in $root.
       *
       * @return mixed false when no usable /Root reference exists
       */
      function pdf_read_root() {
          // read root
          $root = $this->pdf_find_root();
          if ($root === false) {
              $this->success = false;
              return false;
          }
          $this->root = $this->pdf_resolve_object($this->c, $root);
      }

      /**
       * Find the xref-Table
       *
       * Looks at (at most) the last 1500 bytes of the file for the last
       * "startxref" keyword and returns the number following it.
       *
       * @return mixed integer offset, or false when no pointer was found
       */
      function pdf_find_xref() {
          fseek($this->f, -min(filesize($this->filename), 1500), SEEK_END);
          $data = (string) fread($this->f, 1500);

          // The last occurrence wins (same position the former
          // strrev()/strpos() arithmetic produced).
          $marker = strrpos($data, 'startxref');
          $found = false;
          if ($marker !== false) {
              $tail = substr($data, $marker + strlen('startxref'));
              $found = preg_match('/\s*(\d+).*$/s', $tail, $matches);
          }

          if (!$found) {
              // mPDF 4.0
              $this->success = false;
              $this->errormsg = sprintf("Unable to find pointer to xref table");
              return false;
          }

          return (int) $matches[1];
      }

      /**
       * Read xref-table
       *
       * Called with an $offset it reads the "xref" keyword and the first
       * subsection header there; called with $start/$end it continues with a
       * further subsection at the current file position. Entries are stored
       * in $result['xref'][object][generation]; an entry that is already
       * present is never overwritten. After the entries either another
       * subsection or the trailer follows; a /Prev entry in the trailer is
       * followed recursively.
       *
       * Errors are reported through $success / $errormsg.
       *
       * @param array $result Array of xref-table
       * @param integer $offset of xref-table
       * @param integer $start start-position in xref-table
       * @param integer $end end-position in xref-table
       */
      function pdf_read_xref(&$result, $offset, $start = null, $end = null) {
          if (is_null($start) || is_null($end)) {
              fseek($this->f, $o_pos = $offset);
              $data = trim(fgets($this->f, 1024));
              if (strlen($data) == 0) {
                  $data = trim(fgets($this->f, 1024));
              }

              if ($data !== 'xref') {
                  fseek($this->f, $o_pos);
                  $data = trim(_fgets($this->f, true));
                  if ($data !== 'xref') {
                      if (preg_match('/(.*xref)(.*)/m', $data, $m)) { // xref 0 128 - in one line
                          fseek($this->f, $o_pos + strlen($m[1]));
                      } elseif (preg_match('/(x|r|e|f)+/', $data, $m)) { // correct invalid xref-pointer
                          $tmpOffset = $offset - 4 + strlen($m[0]);
                          if ($tmpOffset == $offset) {
                              // Retrying at the very same offset would recurse forever.
                              $this->success = false;
                              $this->errormsg = sprintf("Unable to find xref table - Maybe a Problem with 'auto_detect_line_endings'");
                              return;
                          }
                          $this->pdf_read_xref($result, $tmpOffset, $start, $end);
                          return;
                      } else {
                          // mPDF 4.0
                          $this->success = false;
                          $this->errormsg = sprintf("Unable to find xref table - Maybe a Problem with 'auto_detect_line_endings'");
                          return;
                      }
                  }
              }

              // Subsection header: "<first object number> <number of entries>"
              $o_pos = ftell($this->f);
              $data = explode(' ', trim(fgets($this->f, 1024)));
              if (count($data) != 2) {
                  fseek($this->f, $o_pos);
                  $data = explode(' ', trim(_fgets($this->f, true)));

                  if (count($data) != 2) {
                      if (count($data) > 2) { // no lineending
                          $n_pos = $o_pos + strlen($data[0]) + strlen($data[1]) + 2;
                          fseek($this->f, $n_pos);
                      } else {
                          // mPDF 4.0
                          $this->success = false;
                          $this->errormsg = sprintf("Unexpected header in xref table");
                          return;
                      }
                  }
              }
              // Cast like the subsection branch below does, so a garbled
              // header cannot end up in string arithmetic.
              $start = (int) $data[0];
              $end = $start + (int) $data[1];
          }

          if (!isset($result['xref_location'])) {
              $result['xref_location'] = $offset;
          }

          if (!isset($result['max_object']) || $end > $result['max_object']) {
              $result['max_object'] = $end;
          }

          for (; $start < $end; $start++) {
              $data = ltrim(fread($this->f, 20)); // Specification says: 20 bytes including newlines
              $entryOffset = substr($data, 0, 10);
              $generation = substr($data, 11, 5);
              if (!isset($result['xref'][$start][(int) $generation])) {
                  $result['xref'][$start][(int) $generation] = (int) $entryOffset;
              }
          }

          $o_pos = ftell($this->f);
          $data = fgets($this->f, 1024);
          if (strlen(trim($data)) == 0) {
              $data = fgets($this->f, 1024);
          }

          if (preg_match("/trailer/", $data)) {
              if (preg_match("/(.*trailer[ \n\r]*)/", $data, $m)) {
                  fseek($this->f, $o_pos + strlen($m[1]));
              }

              // mPDF 5.0 Removed pass by reference =&
              $c = new pdf_context($this->f);
              $trailer = $this->pdf_read_value($c);

              if (isset($trailer[1]['/Prev'])) {
                  $this->pdf_read_xref($result, $trailer[1]['/Prev'][1]);
                  if ($this->success == false) {
                      // The previous section could not be read; the error is
                      // already recorded and there is nothing to merge into.
                      return;
                  }
                  if (isset($result['trailer'][1]) && is_array($result['trailer'][1])) {
                      // Entries of this (newer) trailer override the older ones
                      $result['trailer'][1] = array_merge($result['trailer'][1], $trailer[1]);
                  } else {
                      $result['trailer'] = $trailer;
                  }
              } else {
                  $result['trailer'] = $trailer;
              }
          } else {
              // No trailer yet: the line is the header of the next subsection
              $data = explode(' ', trim($data));

              if (count($data) != 2) {
                  fseek($this->f, $o_pos);
                  $data = explode(' ', trim(_fgets($this->f, true)));

                  if (count($data) != 2) {
                      // mPDF 4.0
                      $this->success = false;
                      $this->errormsg = sprintf("Unexpected data in xref table");
                      return;
                  }
              }

              $this->pdf_read_xref($result, null, (int) $data[0], (int) $data[0] + (int) $data[1]);
          }
      }

      /**
       * Reads an Value
       *
       * Returns an array whose element 0 is one of the PDF_TYPE_* constants:
       * hex string, dictionary, array, literal string, stream, numeric,
       * object declaration ("n g obj"), object reference ("n g R") or a
       * plain token.
       *
       * @param object $c pdf_context
       * @param string $token a Token (read from $c when omitted)
       * @return mixed the value array, or false when the data ends prematurely
       */
      function pdf_read_value(&$c, $token = null) {
          if (is_null($token)) {
              $token = $this->pdf_read_token($c);
          }

          if ($token === false) {
              return false;
          }

          switch ($token) {
              case '<':
                  // This is a hex string.
                  // Read the value, then the terminator
                  $pos = $c->offset;

                  while (1) {
                      $match = strpos($c->buffer, '>', $pos);

                      // If you can't find it, try
                      // reading more data from the stream
                      if ($match === false) {
                          if (!$c->increase_length()) {
                              return false;
                          }
                          continue;
                      }

                      $result = substr($c->buffer, $c->offset, $match - $c->offset);
                      $c->offset = $match + 1;

                      return array(PDF_TYPE_HEX, $result);
                  }

              case '<<':
                  // This is a dictionary.
                  $result = array();

                  // Recurse into this function until we reach
                  // the end of the dictionary.
                  while (($key = $this->pdf_read_token($c)) !== '>>') {
                      if ($key === false) {
                          return false;
                      }

                      if (($value = $this->pdf_read_value($c)) === false) {
                          return false;
                      }
                      $result[$key] = $value;
                  }

                  return array(PDF_TYPE_DICTIONARY, $result);

              case '[':
                  // This is an array.
                  $result = array();

                  // Recurse into this function until we reach
                  // the end of the array.
                  while (($token = $this->pdf_read_token($c)) !== ']') {
                      if ($token === false) {
                          return false;
                      }

                      if (($value = $this->pdf_read_value($c, $token)) === false) {
                          return false;
                      }

                      $result[] = $value;
                  }

                  return array(PDF_TYPE_ARRAY, $result);

              case '(':
                  // This is a string
                  $pos = $c->offset;

                  while (1) {
                      // Start by finding the next closed
                      // parenthesis
                      $match = strpos($c->buffer, ')', $pos);

                      // If you can't find it, try
                      // reading more data from the stream
                      if ($match === false) {
                          if (!$c->increase_length()) {
                              return false;
                          }
                          continue;
                      }

                      // Make sure that there is no backslash
                      // before the parenthesis. If there is,
                      // move on. Otherwise, return the string.
                      // (An even number of backslashes escapes only backslashes.)
                      $esc = preg_match('/([\\\\]+)$/', $tmpresult = substr($c->buffer, $c->offset, $match - $c->offset), $m);

                      if ($esc === 0 || strlen($m[1]) % 2 == 0) {
                          $result = $tmpresult;
                          $c->offset = $match + 1;
                          return array(PDF_TYPE_STRING, $result);
                      } else {
                          $pos = $match + 1;

                          if ($pos > $c->offset + $c->length) {
                              $c->increase_length();
                          }
                      }
                  }

              case "stream":
                  // Absolute file position right behind the "stream" keyword
                  $o_pos = ftell($c->file) - strlen($c->buffer);
                  $o_offset = $c->offset;

                  $c->reset($startpos = $o_pos + $o_offset);

                  // ensure line breaks in front of the stream:
                  // skips "\n", "\r" or "\r\n"
                  $e = 0;
                  if ($c->buffer[0] == chr(10) || $c->buffer[0] == chr(13)) {
                      $e++;
                  }
                  if ($c->buffer[1] == chr(10) && $c->buffer[0] != chr(10)) {
                      $e++;
                  }

                  // The length comes from the dictionary of the object that
                  // is currently being resolved; it may be an indirect reference.
                  if ($this->actual_obj[1][1]['/Length'][0] == PDF_TYPE_OBJREF) {
                      // mPDF 5.0 Removed pass by reference =&
                      $tmp_c = new pdf_context($this->f);
                      $tmp_length = $this->pdf_resolve_object($tmp_c, $this->actual_obj[1][1]['/Length']);
                      $length = $tmp_length[1][1];
                  } else {
                      $length = $this->actual_obj[1][1]['/Length'][1];
                  }

                  if ($length > 0) {
                      $c->reset($startpos + $e, $length);
                      $v = $c->buffer;
                  } else {
                      $v = '';
                  }
                  $c->reset($startpos + $e + $length + 9); // 9 = strlen("endstream")

                  return array(PDF_TYPE_STREAM, $v);

              default:
                  if (is_numeric($token)) {
                      // A numeric token. Make sure that
                      // it is not part of something else.
                      if (($tok2 = $this->pdf_read_token($c)) !== false) {
                          if (is_numeric($tok2)) {

                              // Two numeric tokens in a row.
                              // In this case, we're probably in
                              // front of either an object reference
                              // or an object specification.
                              // Determine the case and return the data
                              if (($tok3 = $this->pdf_read_token($c)) !== false) {
                                  switch ($tok3) {
                                      case 'obj':
                                          return array(PDF_TYPE_OBJDEC, (int) $token, (int) $tok2);
                                      case 'R':
                                          return array(PDF_TYPE_OBJREF, (int) $token, (int) $tok2);
                                  }
                                  // If we get to this point, that numeric value up
                                  // there was just a numeric value. Push the extra
                                  // tokens back into the stack and return the value.
                                  array_push($c->stack, $tok3);
                              }
                          }

                          array_push($c->stack, $tok2);
                      }

                      return array(PDF_TYPE_NUMERIC, $token);
                  } else {
                      // Just a token. Return it.
                      return array(PDF_TYPE_TOKEN, $token);
                  }
          }
      }

      /**
       * Resolve an object
       *
       * Anything that is not an object reference is handed back unchanged.
       * A reference is looked up in the xref data, the object is read from
       * its offset and the context is put back to the file position it had
       * before.
       *
       * @param object $c pdf_context
       * @param array $obj_spec The object-data
       * @param boolean $encapsulate Must set to true, cause the parsing and fpdi use this method only without this para
       * @return mixed the resolved object array, $obj_spec itself when it is
       *               no reference, false on invalid input or when the object
       *               is not found at its offset, null when the reference has
       *               no xref entry
       */
      function pdf_resolve_object(&$c, $obj_spec, $encapsulate = true) {
          // Exit if we get invalid data
          if (!is_array($obj_spec)) {
              return false;
          }

          if ($obj_spec[0] != PDF_TYPE_OBJREF) {
              return $obj_spec;
          }

          // This is a reference, resolve it
          if (!isset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]])) {
              // Unknown object: callers have always received null here
              return null;
          }

          // Save current file position
          // This is needed if you want to resolve
          // references while you're reading another object
          // (e.g.: if you need to determine the length
          // of a stream)
          $old_pos = ftell($c->file);

          // Reposition the file pointer and
          // load the object header.
          $c->reset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]]);

          $header = $this->pdf_read_value($c);

          // pdf_read_value() returns false when nothing could be read
          if (!is_array($header) || $header[0] != PDF_TYPE_OBJDEC || $header[1] != $obj_spec[1] || $header[2] != $obj_spec[2]) {
              // mPDF 4.0
              $this->success = false;
              $this->errormsg = sprintf("Unable to find object (%s, %s) at expected location", $obj_spec[1], $obj_spec[2]);
              return false;
          }

          // If we're being asked to store all the information
          // about the object, we add the object ID and generation
          // number for later use
          // (bound by reference so pdf_read_value() sees the dictionary
          // while the stream of the same object is being read)
          $this->actual_obj =& $result;
          if ($encapsulate) {
              $result = array(
                  PDF_TYPE_OBJECT,
                  'obj' => $obj_spec[1],
                  'gen' => $obj_spec[2]
              );
          } else {
              $result = array();
          }

          // Now simply read the object data until
          // we encounter an end-of-object marker
          while (1) {
              $value = $this->pdf_read_value($c);
              if ($value === false || count($result) > 4) {
                  // in this case the parser couldn't find an endobj so we break here
                  break;
              }

              if ($value[0] == PDF_TYPE_TOKEN && $value[1] === 'endobj') {
                  break;
              }

              $result[] = $value;
          }

          $c->reset($old_pos);

          if (isset($result[2][0]) && $result[2][0] == PDF_TYPE_STREAM) {
              $result[0] = PDF_TYPE_STREAM;
          }

          return $result;
      }

      /**
       * Reads a token from the file
       *
       * Tokens pushed back onto the context's stack are returned first.
       * Otherwise leading whitespace is skipped and either a delimiter
       * ("[", "]", "(", ")", "<", ">", "<<", ">>") or the run of characters
       * up to the next delimiter/whitespace is returned.
       *
       * @param object $c pdf_context
       * @return mixed the token, or false when no more content is available
       */
      function pdf_read_token(&$c)
      {
          // If there is a token available
          // on the stack, pop it out and
          // return it.
          if (count($c->stack)) {
              return array_pop($c->stack);
          }

          // Strip away any whitespace
          do {
              if (!$c->ensure_content()) {
                  return false;
              }
              $c->offset += _strspn($c->buffer, " \n\r\t", $c->offset);
          } while ($c->offset >= $c->length - 1);

          // Get the first character in the stream
          $char = $c->buffer[$c->offset++];

          switch ($char) {
              case '[':
              case ']':
              case '(':
              case ')':
                  // This is either an array or literal string
                  // delimiter, Return it
                  return $char;

              case '<':
              case '>':
                  // This could either be a hex string or
                  // dictionary delimiter. Determine the
                  // appropriate case and return the token
                  if ($c->buffer[$c->offset] == $char) {
                      if (!$c->ensure_content()) {
                          return false;
                      }
                      $c->offset++;
                      return $char . $char;
                  } else {
                      return $char;
                  }

              default:
                  // This is "another" type of token (probably
                  // a dictionary entry or a numeric value)
                  // Find the end and return it.
                  if (!$c->ensure_content()) {
                      return false;
                  }

                  while (1) {
                      // Determine the length of the token
                      $pos = _strcspn($c->buffer, " []<>()\r\n\t/", $c->offset);

                      if ($c->offset + $pos <= $c->length - 1) {
                          break;
                      }

                      // If the script reaches this point,
                      // the token may span beyond the end
                      // of the current buffer. Therefore,
                      // we increase the size of the buffer
                      // and try again--just to be safe.
                      // NOTE(review): assumes increase_length() returns a
                      // false value once nothing more can be read (the checks
                      // in pdf_read_value() rely on the same); without this
                      // exit the loop would never end at the end of the file.
                      if (!$c->increase_length()) {
                          break;
                      }
                  }

                  $result = substr($c->buffer, $c->offset - 1, $pos + 1);

                  $c->offset += $pos;
                  return $result;
          }
      }
  }
  563. }
  564. ?>