PoStreamReader.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. <?php
  2. /**
  3. * @file
  4. * Contains \Drupal\Component\Gettext\PoStreamReader.
  5. */
  6. /**
  7. * Implements Gettext PO stream reader.
  8. *
  9. * The PO file format parsing is implemented according to the documentation at
  10. * http://www.gnu.org/software/gettext/manual/gettext.html#PO-Files.
  11. */
  12. class PoStreamReader implements PoStreamInterface, PoReaderInterface {
  13. /**
  14. * Source line number of the stream being parsed.
  15. *
  16. * @var int
  17. */
  18. private $_line_number = 0;
  19. /**
  20. * Parser context for the stream reader state machine.
  21. *
  22. * Possible contexts are:
  23. * - 'COMMENT' (#)
  24. * - 'MSGID' (msgid)
  25. * - 'MSGID_PLURAL' (msgid_plural)
  26. * - 'MSGCTXT' (msgctxt)
  27. * - 'MSGSTR' (msgstr)
  28. * - 'MSGSTR_ARR' (msgstr[])
  29. *
  30. * @var string
  31. */
  32. private $_context = 'COMMENT';
  33. /**
  34. * Current entry being read. Incomplete.
  35. *
  36. * @var array
  37. */
  38. private $_current_item = array();
  39. /**
  40. * Current plural index for plural translations.
  41. *
  42. * @var int
  43. */
  44. private $_current_plural_index = 0;
  45. /**
  46. * URI of the PO stream that is being read.
  47. *
  48. * @var string
  49. */
  50. private $_uri = '';
  51. /**
  52. * Language code for the PO stream being read.
  53. *
  54. * @var string
  55. */
  56. private $_langcode = NULL;
  57. /**
  58. * Size of the current PO stream.
  59. *
  60. * @var int
  61. */
  62. private $_size;
  63. /**
  64. * File handle of the current PO stream.
  65. *
  66. * @var resource
  67. */
  68. private $_fd;
  69. /**
  70. * The PO stream header.
  71. *
  72. * @var PoHeader
  73. */
  74. private $_header;
  75. /**
  76. * Object wrapper for the last read source/translation pair.
  77. *
  78. * @var PoItem
  79. */
  80. private $_last_item;
  81. /**
  82. * Indicator of whether the stream reading is finished.
  83. *
  84. * @var boolean
  85. */
  86. private $_finished;
  87. /**
  88. * Array of translated error strings recorded on reading this stream so far.
  89. *
  90. * @var array
  91. */
  92. private $_errors;
  /**
   * Implements PoMetadataInterface::getLangcode().
   *
   * @return string|null
   *   The language code assigned to this stream, or NULL when none has been
   *   set yet.
   */
  public function getLangcode() {
    $langcode = $this->_langcode;
    return $langcode;
  }
  /**
   * Implements PoMetadataInterface::setLangcode().
   *
   * @param string $langcode
   *   Language code to record for this stream. setItemFromArray() passes it
   *   on to every item it builds.
   */
  public function setLangcode($langcode) {
    $this->_langcode = $langcode;
  }
  /**
   * Implements PoMetadataInterface::getHeader().
   *
   * @return PoHeader|null
   *   The header parsed by readHeader(), or NULL when no header has been read
   *   (for example because the stream was empty).
   */
  public function getHeader() {
    $header = $this->_header;
    return $header;
  }
  /**
   * Implements PoMetadataInterface::setHeader().
   *
   * A reader takes its header from the stream itself, so assigning one from
   * the outside is not applicable and this method does nothing.
   *
   * @param PoHeader $header
   *   Ignored.
   */
  public function setHeader(PoHeader $header) {
    // Intentionally empty: the header is only ever populated by readHeader().
  }
  /**
   * Implements PoStreamInterface::getURI().
   *
   * @return string
   *   The URI of the PO stream, or an empty string when none has been set.
   */
  public function getURI() {
    $uri = $this->_uri;
    return $uri;
  }
  /**
   * Implements PoStreamInterface::setURI().
   *
   * @param string $uri
   *   URI of the PO stream that open() should read from.
   */
  public function setURI($uri) {
    $this->_uri = $uri;
  }
  /**
   * Implements PoStreamInterface::open().
   *
   * Opens the stream in binary read mode, records its size, resets the parser
   * state and reads the header. The stream is ready for reading items after.
   *
   * @throws Exception
   *   If the URI is not yet set, or if the stream cannot be opened.
   */
  public function open() {
    if (empty($this->_uri)) {
      throw new \Exception('Cannot open stream without URI set.');
    }

    $fd = fopen($this->_uri, 'rb');
    if ($fd === FALSE) {
      // Fail loudly rather than hand an invalid handle to fstat()/fgets().
      throw new \Exception('Cannot open stream ' . $this->_uri . ' for reading.');
    }
    $this->_fd = $fd;

    // The position right after fopen() is always 0, so it says nothing about
    // the size; ask the stream itself. Streams that cannot report a size are
    // recorded as 0.
    $stat = fstat($this->_fd);
    $this->_size = (is_array($stat) && isset($stat['size'])) ? $stat['size'] : 0;

    // Start from a clean parser state so that a reader which already ran to
    // the end of a previous stream can be opened again.
    $this->_finished = FALSE;
    $this->_line_number = 0;
    $this->_context = 'COMMENT';
    $this->_current_item = array();
    $this->_last_item = NULL;
    $this->_header = NULL;

    $this->readHeader();
  }
  /**
   * Implements PoStreamInterface::close().
   *
   * Closes the file handle and forgets it, so that closing twice is reported
   * the same way as closing a stream that was never opened.
   *
   * @throws Exception
   *   If the stream is not open.
   */
  public function close() {
    // is_resource() is FALSE both for a missing handle and for one that has
    // already been closed.
    if (!is_resource($this->_fd)) {
      throw new \Exception('Cannot close stream that is not open.');
    }
    fclose($this->_fd);
    // Drop the stale handle; a closed resource is still truthy and would
    // otherwise be passed to fclose() again on the next call.
    $this->_fd = NULL;
  }
  /**
   * Implements PoReaderInterface::readItem().
   *
   * @return PoItem|null
   *   The next complete item found in the stream, or NULL when the stream
   *   finished before another item was completed.
   */
  public function readItem() {
    // Forget whatever the previous call produced.
    $this->_last_item = NULL;
    // Feed lines to the parser until it completes an item or the stream is
    // exhausted. The check runs before every read, including the first one.
    while (!($this->_finished || $this->_last_item !== NULL)) {
      $this->readLine();
    }
    return $this->_last_item;
  }
  /**
   * Moves the read pointer of the current PO stream.
   *
   * @param int $seek
   *   Byte offset, counted from the start of the stream, to continue reading
   *   from.
   */
  public function setSeek($seek) {
    // SEEK_SET is fseek()'s default; spelled out to make the origin explicit.
    fseek($this->_fd, $seek, SEEK_SET);
  }
  /**
   * Reports where the read pointer of the current PO stream is.
   *
   * @return int|FALSE
   *   The value of ftell() on the stream's file handle.
   */
  public function getSeek() {
    $position = ftell($this->_fd);
    return $position;
  }
  /**
   * Reads the header from the PO stream.
   *
   * The header is stored in the stream like any other item: an empty source
   * string whose translation holds the key-value pairs. The regular item
   * reader is therefore reused and its translation handed to a PoHeader.
   * Nothing is stored when the reader yields no item, which is the case for
   * an empty (0 bytes) .po file.
   */
  private function readHeader() {
    $first_item = $this->readItem();
    if ($first_item) {
      $parsed = new PoHeader();
      $parsed->setFromString(trim($first_item->getTranslation()));
      $this->_header = $parsed;
    }
  }
  /**
   * Reads a line from the PO stream and stores data internally.
   *
   * Expands $this->_current_item based on new data for the current item. If
   * this line ends the current item, it is saved with setItemFromArray() with
   * data from $this->_current_item.
   *
   * An internal state machine is maintained in this reader using
   * $this->_context as the reading state. PO items are in between COMMENT
   * states (when items have at least one line or comment in between them) or
   * indicated by MSGSTR or MSGSTR_ARR followed immediately by an MSGID or
   * MSGCTXT (when items closely follow each other).
   *
   * @return FALSE|NULL
   *   FALSE if an error was logged, NULL otherwise. The errors are considered
   *   non-blocking, so reading can continue, while the errors are collected
   *   for later presentation.
   */
  private function readLine() {
    // Read a line and set the stream finished indicator if it was not
    // possible anymore.
    $line = fgets($this->_fd);
    $this->_finished = ($line === FALSE);

    // Common placeholders for error messages. The line number is bound by
    // reference so that messages show the value after the increment below.
    $log_vars = array(
      '%uri' => $this->getURI(),
      '%line' => &$this->_line_number,
    );

    if (!$this->_finished) {
      if ($this->_line_number == 0) {
        // The first line might come with a UTF-8 BOM. Remove it only at the
        // very start of the line, so identical bytes further on survive.
        if (strncmp($line, "\xEF\xBB\xBF", 3) === 0) {
          $line = (string) substr($line, 3);
        }
        // Current plurality for 'msgstr[]'.
        $this->_current_plural_index = 0;
      }

      // Track the line number for error reporting.
      $this->_line_number++;

      // Trim away the linefeed. \\n might appear at the end of the string if
      // another line continuing the same string follows. We can remove that.
      $line = trim(strtr($line, array("\\\n" => "")));

      if (!strncmp('#', $line, 1)) {
        // Lines starting with '#' are comments.
        if ($this->_context == 'COMMENT') {
          // Already in comment context, add to current comment.
          $this->_current_item['#'][] = substr($line, 1);
        }
        elseif (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) {
          // We are currently in string context, save current item.
          $this->setItemFromArray($this->_current_item);

          // Start a new entry for the comment.
          $this->_current_item = array();
          $this->_current_item['#'][] = substr($line, 1);

          $this->_context = 'COMMENT';
          return NULL;
        }
        else {
          // A comment following any other context is a syntax error.
          $this->_errors[] = format_string('The translation stream %uri contains an error: "msgstr" was expected but not found on line %line.', $log_vars);
          return FALSE;
        }
        return NULL;
      }
      elseif (!strncmp('msgid_plural', $line, 12)) {
        // A plural form for the current source string.
        if ($this->_context != 'MSGID') {
          // A plural form can only be added to an msgid directly. The keyword
          // was found where it does not belong, so report it as unexpected.
          $this->_errors[] = format_string('The translation stream %uri contains an error: "msgid_plural" is unexpected on line %line.', $log_vars);
          return FALSE;
        }

        // Remove 'msgid_plural' and trim away whitespace.
        $line = trim(substr($line, 12));

        // Only the plural source string is left, parse it.
        $quoted = $this->parseQuoted($line);
        if ($quoted === FALSE) {
          // The plural form must be wrapped in quotes.
          $this->_errors[] = format_string('The translation stream %uri contains a syntax error on line %line.', $log_vars);
          return FALSE;
        }

        // Append the plural source to the current entry.
        if (is_string($this->_current_item['msgid'])) {
          // The first value was stored as string. Now we know the context is
          // plural, it is converted to array.
          $this->_current_item['msgid'] = array($this->_current_item['msgid']);
        }
        $this->_current_item['msgid'][] = $quoted;

        $this->_context = 'MSGID_PLURAL';
        return NULL;
      }
      elseif (!strncmp('msgid', $line, 5)) {
        // Starting a new message.
        if (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) {
          // We are currently in string context, save current item.
          $this->setItemFromArray($this->_current_item);

          // Start a new context for the msgid.
          $this->_current_item = array();
        }
        elseif ($this->_context == 'MSGID') {
          // We are currently already in the context, meaning we passed an id
          // with no data.
          $this->_errors[] = format_string('The translation stream %uri contains an error: "msgid" is unexpected on line %line.', $log_vars);
          return FALSE;
        }

        // Remove 'msgid' and trim away whitespace.
        $line = trim(substr($line, 5));

        // Only the message id string is left, parse it.
        $quoted = $this->parseQuoted($line);
        if ($quoted === FALSE) {
          // The message id must be wrapped in quotes.
          $this->_errors[] = format_string('The translation stream %uri contains an error: invalid format for "msgid" on line %line.', $log_vars);
          return FALSE;
        }

        $this->_current_item['msgid'] = $quoted;
        $this->_context = 'MSGID';
        return NULL;
      }
      elseif (!strncmp('msgctxt', $line, 7)) {
        // Starting a new context.
        if (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) {
          // We are currently in string context, save current item.
          $this->setItemFromArray($this->_current_item);
          $this->_current_item = array();
        }
        elseif (!empty($this->_current_item['msgctxt'])) {
          // A context cannot apply to another context.
          $this->_errors[] = format_string('The translation stream %uri contains an error: "msgctxt" is unexpected on line %line.', $log_vars);
          return FALSE;
        }

        // Remove 'msgctxt' and trim away whitespaces.
        $line = trim(substr($line, 7));

        // Only the msgctxt string is left, parse it.
        $quoted = $this->parseQuoted($line);
        if ($quoted === FALSE) {
          // The context string must be quoted.
          $this->_errors[] = format_string('The translation stream %uri contains an error: invalid format for "msgctxt" on line %line.', $log_vars);
          return FALSE;
        }

        $this->_current_item['msgctxt'] = $quoted;
        $this->_context = 'MSGCTXT';
        return NULL;
      }
      elseif (!strncmp('msgstr[', $line, 7)) {
        // A message string for a specific plurality.
        if (($this->_context != 'MSGID') &&
            ($this->_context != 'MSGCTXT') &&
            ($this->_context != 'MSGID_PLURAL') &&
            ($this->_context != 'MSGSTR_ARR')) {
          // Plural message strings must come after msgid, msgctxt,
          // msgid_plural, or other msgstr[] entries.
          $this->_errors[] = format_string('The translation stream %uri contains an error: "msgstr[]" is unexpected on line %line.', $log_vars);
          return FALSE;
        }

        // Ensure the plurality is terminated.
        if (strpos($line, ']') === FALSE) {
          $this->_errors[] = format_string('The translation stream %uri contains an error: invalid format for "msgstr[]" on line %line.', $log_vars);
          return FALSE;
        }

        // Extract the plurality. The line starts with 'msgstr[', so the first
        // ']' after the opening bracket closes the index.
        $frombracket = strstr($line, '[');
        $closing = strpos($frombracket, ']');
        $this->_current_plural_index = substr($frombracket, 1, $closing - 1);

        // Everything after the closing bracket is the message text. Taking it
        // from there, instead of from the first space, accepts any whitespace
        // (or none) as separator, like the other keyword branches do.
        $line = trim(substr($frombracket, $closing + 1));

        $quoted = $this->parseQuoted($line);
        if ($quoted === FALSE) {
          // The string must be quoted.
          $this->_errors[] = format_string('The translation stream %uri contains an error: invalid format for "msgstr[]" on line %line.', $log_vars);
          return FALSE;
        }
        if (!isset($this->_current_item['msgstr']) || !is_array($this->_current_item['msgstr'])) {
          $this->_current_item['msgstr'] = array();
        }

        $this->_current_item['msgstr'][$this->_current_plural_index] = $quoted;

        $this->_context = 'MSGSTR_ARR';
        return NULL;
      }
      elseif (!strncmp("msgstr", $line, 6)) {
        // A string pair for an msgid (with optional context).
        if (($this->_context != 'MSGID') && ($this->_context != 'MSGCTXT')) {
          // Strings are only valid within an id or context scope.
          $this->_errors[] = format_string('The translation stream %uri contains an error: "msgstr" is unexpected on line %line.', $log_vars);
          return FALSE;
        }

        // Remove 'msgstr' and trim away whitespaces.
        $line = trim(substr($line, 6));

        // Only the msgstr string is left, parse it.
        $quoted = $this->parseQuoted($line);
        if ($quoted === FALSE) {
          // The string must be quoted.
          $this->_errors[] = format_string('The translation stream %uri contains an error: invalid format for "msgstr" on line %line.', $log_vars);
          return FALSE;
        }

        $this->_current_item['msgstr'] = $quoted;

        $this->_context = 'MSGSTR';
        return NULL;
      }
      elseif ($line != '') {
        // Anything that is not a token may be a continuation of a previous
        // token.
        $quoted = $this->parseQuoted($line);
        if ($quoted === FALSE) {
          // This string must be quoted.
          $this->_errors[] = format_string('The translation stream %uri contains an error: string continuation expected on line %line.', $log_vars);
          return FALSE;
        }

        // Append the string to the current item.
        if (($this->_context == 'MSGID') || ($this->_context == 'MSGID_PLURAL')) {
          if (is_array($this->_current_item['msgid'])) {
            // Add string to last array element for plural sources.
            $last_index = count($this->_current_item['msgid']) - 1;
            $this->_current_item['msgid'][$last_index] .= $quoted;
          }
          else {
            // Singular source, just append the string.
            $this->_current_item['msgid'] .= $quoted;
          }
        }
        elseif ($this->_context == 'MSGCTXT') {
          // Multiline context name.
          $this->_current_item['msgctxt'] .= $quoted;
        }
        elseif ($this->_context == 'MSGSTR') {
          // Multiline translation string.
          $this->_current_item['msgstr'] .= $quoted;
        }
        elseif ($this->_context == 'MSGSTR_ARR') {
          // Multiline plural translation string.
          $this->_current_item['msgstr'][$this->_current_plural_index] .= $quoted;
        }
        else {
          // No valid context to append to.
          $this->_errors[] = format_string('The translation stream %uri contains an error: unexpected string on line %line.', $log_vars);
          return FALSE;
        }
        return NULL;
      }
    }

    // Empty line read or EOF of PO stream, close out the last entry.
    if (($this->_context == 'MSGSTR') || ($this->_context == 'MSGSTR_ARR')) {
      $this->setItemFromArray($this->_current_item);
      $this->_current_item = array();
    }
    elseif ($this->_context != 'COMMENT') {
      $this->_errors[] = format_string('The translation stream %uri ended unexpectedly at line %line.', $log_vars);
      return FALSE;
    }
    return NULL;
  }
  /**
   * Stores the parsed values as a PoItem object.
   *
   * Builds a PoItem from the raw entry, makes it the last read item and puts
   * the parser back into the COMMENT state.
   *
   * @param array $value
   *   Raw entry as collected by readLine(). Reads the keys 'msgid' and
   *   'msgstr', and the optional keys 'msgctxt' and '#' (comment lines). An
   *   array under 'msgstr' marks the item as plural.
   */
  public function setItemFromArray($value) {
    // Comments are optional; without them the item stays in the default
    // text group.
    $has_comments = isset($value['#']);
    $comments = $has_comments ? $this->shortenComments($value['#']) : '';
    $textgroup = $has_comments ? $this->fetchGroupFromComment($comments) : 'default';

    // Plural translations arrive keyed by form index; put them in index order.
    $plural = is_array($value['msgstr']);
    if ($plural) {
      ksort($value['msgstr']);
    }

    if (isset($value['msgctxt'])) {
      $context = $value['msgctxt'];
    }
    else {
      $context = '';
    }

    $item = new PoItem();
    $item->setContext($context);
    $item->setSource($value['msgid']);
    $item->setTranslation($value['msgstr']);
    $item->setPlural($plural);
    $item->setComment($comments);
    $item->setLangcode($this->_langcode);
    $item->setTextgroup($textgroup);

    $this->_last_item = $item;
    $this->_context = 'COMMENT';
  }
  /**
   * Parses a string in quotes.
   *
   * @param string $string
   *   A string specified with enclosing quotes.
   *
   * @return string|FALSE
   *   The string parsed from inside the quotes: escape sequences are resolved
   *   for double quotes, the content is returned verbatim for single quotes.
   *   FALSE if the string is not enclosed in a matching pair of quotes.
   */
  public function parseQuoted($string) {
    // A quoted value needs an opening and a closing quote. Without this
    // check a lone quote character would serve as both and parse as an empty
    // string.
    if (strlen($string) < 2) {
      return FALSE;
    }

    $quote = substr($string, 0, 1);
    if ($quote !== substr($string, -1, 1)) {
      // Start and end quotes must be the same.
      return FALSE;
    }

    $inner = (string) substr($string, 1, -1);
    if ($quote === '"') {
      // Double quotes: strip slashes.
      return stripcslashes($inner);
    }
    if ($quote === "'") {
      // Simple quote: return as-is.
      return $inner;
    }
    // Unrecognized quote.
    return FALSE;
  }
  /**
   * Generates a short, one-string version of the passed comment array.
   *
   * The first character of every entry is dropped and the remainders are
   * joined with ', '. Entries stop being added once the text collected so far
   * has reached 130 characters; the length is checked before an entry is
   * appended, so the result itself can be longer than that.
   *
   * @param string|array $comment
   *   An array of strings containing a comment.
   *
   * @return string
   *   Short one-string version of the comment.
   */
  private function shortenComments($comment) {
    $joined = '';
    while (count($comment)) {
      $next = substr(array_shift($comment), 1);
      if (strlen($joined) >= 130) {
        // Enough collected already; the entry just taken is discarded.
        break;
      }
      $joined = $joined . $next . ', ';
    }
    // Cut the trailing ', ' separator and surrounding whitespace.
    return trim(substr($joined, 0, -2));
  }
  /**
   * Determines a translation text group using a source's comment-string.
   *
   * @param string $comment
   *   Comment string.
   *
   * @return string
   *   The part of the comment before the first colon when the i18n_string
   *   module exists and that part is one of its text groups; 'default'
   *   otherwise.
   */
  private function fetchGroupFromComment($comment) {
    // Text groups are only looked up when i18n_string is installed, and a
    // comment without a colon cannot name one.
    if (!module_exists('i18n_string') || strpos($comment, ':') === FALSE) {
      return 'default';
    }

    // Fetch available textgroups.
    $known_groups = array_keys(i18n_string_group_info());

    // Parse textgroup from comment (assume default drupal exports).
    $parts = explode(':', $comment);
    $is_known = !empty($parts) && in_array($parts[0], $known_groups);

    return $is_known ? $parts[0] : 'default';
  }
  554. }