tmgmt_file.format.xliff.inc 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635
  1. <?php
  2. /**
  3. * Export to XLIFF format.
  4. *
  5. * The XLIFF processor follows this specification:
  6. * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html
  7. *
  8. * The purpose of this class is to mask or process HTML elements in the source
  9. * and target elements so that translation tools are able to understand which
  10. * content needs to be translated and which content is to be ignored.
  11. *
  12. * On the other hand we need to properly unmask the XLIFF markup back to HTML on
  13. * the translation import. So the process is bidirectional and prior to running
  14. * the unmasking process we try to validate the integrity in the
  15. * validateJobTranslationUponImport() method. Currently the integrity check
  16. * involves only a counter of XLIFF elements that have been created during
  17. * source processing and has to match the number of XLIFF elements being imported
  18. * with the translation.
  19. *
  20. * To process the content DOMDocument object is used due to its ability to
  21. * read broken HTML. This also implies that if broken HTML is in the source
  22. * content the translation content will be fixed to the extent of DOMDocument's
  23. * abilities.
  24. *
  25. * Following is implemented:
  26. * - All pair tags get escaped using <bpt><ept> markup.
  27. * - <br> tags are marked with <x ctype="lb">.
  28. * - <img> tags are marked with <ph ctype="image"> tags. The title and alt
  29. * attributes should have been extracted into <sub> elements, however are not
  30. * as Trados studio triggers a fatal error in case there are two <sub>
  31. * elements at the same level.
  32. *
  33. * Not implemented:
  34. * - Attributes of <img> element are written only as attributes of <ph> element
  35. * instead of using x-html: prefix. This results in conflict with own <ph>
  36. * element's attributes such as "id". The reason why x-html prefix has not
  37. * been used is that Trados studio triggered fatal error on xml validation.
  38. * - Translatable attributes like title and alt.
  39. * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#elem_img
  40. * - Forms - this is big part
  41. * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#HTMLForms
  42. * - <pre> elements
  43. * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#Elem_preformatted
  44. */
  45. class TMGMTFileformatXLIFF extends XMLWriter implements TMGMTFileFormatInterface {
  46. /**
  47. * Contains a reference to the currently being exported job.
  48. *
  49. * @var TMGMTJob
  50. */
  51. protected $job;
  52. protected $importedXML;
  53. protected $importedTransUnits;
  /**
   * Writes one job item into the export as a <group> element.
   *
   * The group carries the job item id as its "id" attribute, a <note> with the
   * source label of the item and one <trans-unit> for every data element that
   * passes the _tmgmt_filter_data filter.
   *
   * @param TMGMTJobItem $item
   *   The job item entity to export.
   */
  protected function addItem(TMGMTJobItem $item) {
    $this->startElement('group');
    $this->writeAttribute('id', $item->tjiid);

    // The source label of the item is exported as a note.
    $source_label = $item->getSourceLabel();
    $this->writeElement('note', $source_label);

    // @todo: Write in nested groups instead of flattening it.
    $flattened = tmgmt_flatten_data($item->getData());
    $translatable = array_filter($flattened, '_tmgmt_filter_data');
    foreach ($translatable as $data_key => $data_element) {
      // Prefix the data key with the job item id.
      $unit_key = $item->tjiid . '][' . $data_key;
      $this->addTransUnit($unit_key, $data_element, $this->job);
    }

    // Close the group.
    $this->endElement();
  }
  /**
   * Writes a <trans-unit> element for one data element.
   *
   * The unit consists of a <source> element, a <target> element (which stays
   * empty unless the data element already carries a translation) and, when
   * the data element has a label, a <note> holding that label.
   *
   * @param string $key
   *   The unique identifier for this data element. It is written as both the
   *   "id" and the "resname" attribute of the unit.
   * @param array $element
   *   Array with the property #text and optionally #label and #translation.
   * @param TMGMTJob $job
   *   Translation job whose settings decide how the text gets written.
   */
  protected function addTransUnit($key, $element, TMGMTJob $job) {
    $keys = tmgmt_ensure_keys_array($key);

    $this->startElement('trans-unit');
    foreach (array('id', 'resname') as $attribute_name) {
      $this->writeAttribute($attribute_name, $key);
    }

    // Source element: CDATA output wins over XLIFF processing; plain escaped
    // text is the fallback.
    $this->startElement('source');
    $source_language = $this->job->getTranslator()->mapToRemoteLanguage($this->job->source_language);
    $this->writeAttribute('xml:lang', $source_language);
    if ($job->getSetting('xliff_cdata')) {
      $this->writeCdata(trim($element['#text']));
    }
    else {
      if ($job->getSetting('xliff_processing')) {
        $this->writeRaw($this->processForExport($element['#text'], $keys));
      }
      else {
        $this->text($element['#text']);
      }
    }
    $this->endElement();

    // Target element: only filled if a translation exists already.
    $this->startElement('target');
    $target_language = $this->job->getTranslator()->mapToRemoteLanguage($this->job->target_language);
    $this->writeAttribute('xml:lang', $target_language);
    $has_translation = !empty($element['#translation']['#text']);
    if ($has_translation && $job->getSetting('xliff_processing')) {
      $this->writeRaw($this->processForExport($element['#translation']['#text'], $keys));
    }
    elseif ($has_translation) {
      $this->text($element['#translation']['#text']);
    }
    $this->endElement();

    // Optional label of the data element.
    if (isset($element['#label'])) {
      $this->writeElement('note', $element['#label']);
    }

    // Close the trans-unit.
    $this->endElement();
  }
  /**
   * Exports a translation job as an XLIFF 1.2 document.
   *
   * The document consists of one <file> element with a <header> (extraction
   * phase carrying the job id, tool information) and a <body> that holds one
   * <group> per exported job item.
   *
   * @param TMGMTJob $job
   *   The translation job to export. It is also stored on the object for use
   *   by the helper methods.
   * @param array $conditions
   *   (optional) Conditions handed over to TMGMTJob::getItems() to select the
   *   job items to export.
   *
   * @return string
   *   The generated XLIFF document.
   */
  public function export(TMGMTJob $job, $conditions = array()) {
    $this->job = $job;

    $this->openMemory();
    $this->setIndent(TRUE);
    $this->setIndentString(' ');
    $this->startDocument('1.0', 'UTF-8');

    // Root element with schema definition.
    $this->startElement('xliff');
    $this->writeAttribute('version', '1.2');
    $this->writeAttribute('xmlns', 'urn:oasis:names:tc:xliff:document:1.2');
    $this->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
    $this->writeAttribute('xsi:schemaLocation', 'urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd');

    // File element.
    $this->startElement('file');
    $this->writeAttribute('original', 'xliff-core-1.2-strict.xsd');
    $this->writeAttribute('source-language', $job->getTranslator()->mapToRemoteLanguage($job->source_language));
    $this->writeAttribute('target-language', $job->getTranslator()->mapToRemoteLanguage($job->target_language));
    $this->writeAttribute('datatype', 'plaintext');
    // The date needs to be an ISO-8601 timestamp in UTC. gmdate() yields UTC
    // regardless of the configured timezone, which is what the trailing "Z"
    // claims; H/i/s are 24-hour hours, minutes and seconds.
    $this->writeAttribute('date', gmdate('Y-m-d\TH:i:s\Z'));

    // Header: extraction phase (carries the job id) and tool information.
    $this->startElement('header');
    $this->startElement('phase-group');
    $this->startElement('phase');
    $this->writeAttribute('tool-id', 'tmgmt');
    $this->writeAttribute('phase-name', 'extraction');
    $this->writeAttribute('process-name', 'extraction');
    $this->writeAttribute('job-id', $job->tjid);
    // End the phase and phase-group tags.
    $this->endElement();
    $this->endElement();
    $this->startElement('tool');
    $this->writeAttribute('tool-id', 'tmgmt');
    $this->writeAttribute('tool-name', 'Drupal Translation Management Tools');
    // End the tool and header tags.
    $this->endElement();
    $this->endElement();

    // Body: one group per job item.
    $this->startElement('body');
    foreach ($job->getItems($conditions) as $item) {
      $this->addItem($item);
    }

    // End the body, file and xliff tags.
    $this->endElement();
    $this->endElement();
    $this->endElement();
    $this->endDocument();
    return $this->outputMemory();
  }
  /**
   * Imports the translated data from an XLIFF file or string.
   *
   * The job is looked up through the "job-id" attribute of the extraction
   * phase found in the document.
   *
   * @param string $imported_file
   *   Path to a file or an XML string to import.
   * @param bool $is_file
   *   (optional) Whether $imported_file is the path to a file or not.
   *
   * @return array|bool
   *   The imported targets passed through tmgmt_unflatten_data(), or FALSE if
   *   the XML cannot be parsed, carries no extraction phase with a job id, or
   *   references a job that cannot be loaded.
   */
  public function import($imported_file, $is_file = TRUE) {
    if (!$this->getImportedXML($imported_file, $is_file)) {
      return FALSE;
    }

    // Without the extraction phase there is no way to find the job.
    $phases = $this->importedXML->xpath("//xliff:phase[@phase-name='extraction']");
    if (empty($phases)) {
      return FALSE;
    }
    $phase = reset($phases);
    if (!isset($phase['job-id'])) {
      return FALSE;
    }

    // getImportedTargets() requires a job object, so bail out instead of
    // handing over a failed load result.
    $job = tmgmt_job_load((string) $phase['job-id']);
    if (empty($job)) {
      return FALSE;
    }

    // Always hand an array to tmgmt_unflatten_data(), also when the file
    // contains no trans-unit at all.
    $targets = $this->getImportedTargets($job);
    if (empty($targets)) {
      $targets = array();
    }
    return tmgmt_unflatten_data($targets);
  }
  /**
   * Validates an imported XLIFF file.
   *
   * Checks:
   * - The file is parseable XML and carries the extraction phase.
   * - The job id of the phase references a loadable job.
   * - Source and target languages match the job.
   * - The file contains translations.
   * - Content integrity: if XLIFF processing is enabled, the number of
   *   elements of every target is compared with the count recorded in the
   *   "xliff_validation" job setting. A mismatch only adds a message; it does
   *   not fail the validation.
   *
   * @param string $imported_file
   *   Path to the file to validate.
   *
   * @return TMGMTJob|bool
   *   The job referenced by the file if the validation passed, FALSE
   *   otherwise.
   */
  public function validateImport($imported_file) {
    if (!($xml = $this->getImportedXML($imported_file))) {
      drupal_set_message(t('The imported file is not a valid XML.'), 'error');
      return FALSE;
    }

    // Check if our phase information is there.
    $phase = $xml->xpath("//xliff:phase[@phase-name='extraction']");
    if (!$phase) {
      drupal_set_message(t('The imported file is missing required XLIFF phase information.'), 'error');
      return FALSE;
    }
    $phase = reset($phase);

    // Check if the job has a valid job reference.
    if (!isset($phase['job-id'])) {
      drupal_set_message(t('The imported file does not contain a job reference.'), 'error');
      return FALSE;
    }

    // Attempt to load the job referenced by the file.
    $job = tmgmt_job_load((int) $phase['job-id']);
    if (empty($job)) {
      drupal_set_message(t('The imported file job id @file_tjid is not available.', array(
        '@file_tjid' => (string) $phase['job-id'],
      )), 'error');
      return FALSE;
    }

    // @todo We use the $job to addMessage in case of failure. However the job
    // context is not safe at this point.
    // Compare source language. Message placeholders get plain strings rather
    // than SimpleXMLElement objects.
    if (!isset($xml->file['source-language']) || $job->getTranslator()->mapToRemoteLanguage($job->source_language) != (string) $xml->file['source-language']) {
      $job->addMessage('The imported file source language @file_language does not match the job source language @job_language.', array(
        '@file_language' => empty($xml->file['source-language']) ? t('none') : (string) $xml->file['source-language'],
        '@job_language' => $job->source_language,
      ), 'error');
      return FALSE;
    }

    // Compare target language.
    if (!isset($xml->file['target-language']) || $job->getTranslator()->mapToRemoteLanguage($job->target_language) != (string) $xml->file['target-language']) {
      $job->addMessage('The imported file target language @file_language does not match the job target language @job_language.', array(
        '@file_language' => empty($xml->file['target-language']) ? t('none') : (string) $xml->file['target-language'],
        '@job_language' => $job->target_language,
      ), 'error');
      return FALSE;
    }

    $targets = $this->getImportedTargets($job);
    if (empty($targets)) {
      // The message type is the third argument, as in the addMessage() calls
      // above; the second one takes the placeholder variables.
      $job->addMessage('The imported file seems to be missing translation.', array(), 'error');
      return FALSE;
    }

    // In case we do not do xliff processing we cannot do the elements
    // count validation.
    if (!$job->getSetting('xliff_processing')) {
      return $job;
    }

    $reader = new XMLReader();
    $xliff_validation = $job->getSetting('xliff_validation');
    foreach ($targets as $id => $target) {
      // The first part of the key is the job item id; the remainder
      // identifies the data element within the item.
      $array_key = tmgmt_ensure_keys_array($id);
      $job_item = tmgmt_job_item_load(array_shift($array_key));

      // Count every node of the target that is neither the wrapper element
      // nor a text node.
      $count = 0;
      $reader->XML('<translation>' . $target['#text'] . '</translation>');
      while ($reader->read()) {
        if (in_array($reader->name, array('translation', '#text'))) {
          continue;
        }
        $count++;
      }

      if (!isset($xliff_validation[$id]) || $xliff_validation[$id] != $count) {
        // Report on the job item; fall back to the job if the id in the file
        // does not reference a loadable job item.
        $recipient = $job_item ? $job_item : $job;
        $recipient->addMessage('Failed to validate semantic integrity of %key element. Please check also the HTML code of the element in the review process.',
          array('%key' => tmgmt_ensure_keys_string($array_key)));
      }
    }

    // Validation successful.
    return $job;
  }
  /**
   * Returns the parsed XLIFF document as a SimpleXMLElement object.
   *
   * The document is parsed only once; later calls return the object stored on
   * the first successful call regardless of the arguments.
   *
   * @param string $imported_file
   *   Path to a file or an XML string to import.
   * @param bool $is_file
   *   (optional) Whether $imported_file is the path to a file or not.
   *
   * @return bool|\SimpleXMLElement
   *   The parsed SimpleXMLElement object. FALSE in case of failed parsing.
   */
  protected function getImportedXML($imported_file, $is_file = TRUE) {
    // Already parsed, nothing to do.
    if (!empty($this->importedXML)) {
      return $this->importedXML;
    }

    // It is not possible to load the file directly with simplexml as it gets
    // url encoded due to the temporary://. This is a PHP bug, see
    // https://bugs.php.net/bug.php?id=61469
    $xml_string = $is_file ? file_get_contents($imported_file) : $imported_file;

    $this->importedXML = simplexml_load_string($xml_string);
    if (!$this->importedXML) {
      return FALSE;
    }

    // Register the XLIFF namespace, required for xpath.
    $this->importedXML->registerXPathNamespace('xliff', 'urn:oasis:names:tc:xliff:document:1.2');
    return $this->importedXML;
  }
  /**
   * Collects the target texts of all trans-units of the imported document.
   *
   * The result is kept on the object, so the document is walked only until a
   * non-empty result has been built.
   *
   * @param TMGMTJob $job
   *   Translation job; its "xliff_processing" setting decides whether the
   *   target markup is converted back to HTML.
   *
   * @return array|bool
   *   Array keyed by trans-unit id, each value being an array with the target
   *   text in #text. FALSE if no document has been loaded yet.
   */
  protected function getImportedTargets(TMGMTJob $job) {
    if (empty($this->importedXML)) {
      return FALSE;
    }

    if (empty($this->importedTransUnits)) {
      $this->importedTransUnits = array();

      $units = $this->importedXML->xpath('//xliff:trans-unit');
      if (empty($units)) {
        // No trans-unit found (or the xpath query failed).
        return $this->importedTransUnits;
      }

      // The setting does not change while iterating, read it once.
      $xliff_processing = $job->getSetting('xliff_processing');
      $reader = new XMLReader();
      foreach ($units as $unit) {
        $unit_id = (string) $unit['id'];

        if (!$xliff_processing) {
          $this->importedTransUnits[$unit_id]['#text'] = (string) $unit->target;
          continue;
        }

        // A unit without a target has nothing to feed into the reader; it
        // contributes an empty text just like in the unprocessed case.
        if (!isset($unit->target)) {
          $this->importedTransUnits[$unit_id]['#text'] = '';
          continue;
        }

        // Read the inner XML of the target and unmask it back to HTML.
        $reader->XML($unit->target->asXML());
        $reader->read();
        $this->importedTransUnits[$unit_id]['#text'] =
          $this->processForImport($reader->readInnerXML(), $job);
      }
    }

    return $this->importedTransUnits;
  }
  /**
   * Processes trans-unit/target to rebuild back the HTML.
   *
   * Text and CDATA content is taken over as is, <x ctype="lb"> becomes
   * <br /> and <ph ctype="image"> becomes an <img /> tag with all attributes
   * of the <ph> element except "ctype" and "id".
   *
   * @param string $translation
   *   Inner XML of the target element.
   * @param TMGMTJob $job
   *   Translation job.
   *
   * @return string
   *   The rebuilt text, or $translation unchanged if XLIFF processing is
   *   disabled for the job.
   */
  protected function processForImport($translation, TMGMTJob $job) {
    // In case we do not want to do xliff processing return the translation as
    // is.
    if (!$job->getSetting('xliff_processing')) {
      return $translation;
    }

    $reader = new XMLReader();
    $reader->XML('<translation>' . $translation . '</translation>');
    $text = '';

    while ($reader->read()) {
      // If the current element is text append it to the result text.
      if ($reader->name == '#text' || $reader->name == '#cdata-section') {
        $text .= $reader->value;
      }
      elseif ($reader->name == 'x') {
        if ($reader->getAttribute('ctype') == 'lb') {
          $text .= '<br />';
        }
      }
      elseif ($reader->name == 'ph') {
        if ($reader->getAttribute('ctype') == 'image') {
          $text .= '<img';
          while ($reader->moveToNextAttribute()) {
            // @todo - we have to use x-html: prefixes for attributes.
            if ($reader->name != 'ctype' && $reader->name != 'id') {
              // The reader hands out the decoded attribute value. Escape it
              // again so that quotes, ampersands and angle brackets in the
              // value cannot break the rebuilt tag.
              $text .= ' ' . $reader->name . '="' . htmlspecialchars($reader->value, ENT_COMPAT, 'UTF-8') . '"';
            }
          }
          $text .= ' />';
        }
      }
    }

    return $text;
  }
  /**
   * Helper function to process the source text.
   *
   * Masks the HTML of the text with XLIFF inline elements: pair tags are
   * written as <bpt>/<ept>, <br> as <x ctype="lb">, <img> as
   * <ph ctype="image"> and text nodes as CDATA sections. The number of written
   * XLIFF elements is recorded per data key in the "xliff_validation" setting
   * of the job, and the job is saved.
   *
   * @param string $source
   *   The HTML text to process.
   * @param array $key_array
   *   The source item data key. Its first element is used as the job item id
   *   when composing the ids of the XLIFF elements.
   *
   * @return string
   *   The processed text.
   */
  protected function processForExport($source, array $key_array) {
    $tjiid = $key_array[0];
    $key_string = tmgmt_ensure_keys_string($key_array);

    // The reason why we use DOMDocument object here and not just XMLReader
    // is the DOMDocument's ability to deal with broken HTML.
    $dom = new DOMDocument();
    // We need to append the head with encoding so that special characters
    // are read correctly.
    $dom->loadHTML("<html><head><meta http-equiv='Content-type' content='text/html; charset=UTF-8' /></head><body>" . $source . '</body></html>');

    $iterator = new RecursiveIteratorIterator(
      new RecursiveDOMIterator($dom),
      RecursiveIteratorIterator::SELF_FIRST);

    $writer = new XMLWriter();
    $writer->openMemory();
    $writer->startDocument('1.0', 'UTF-8');
    $writer->startElement('wrapper');

    // Pair tags that have been opened but not closed yet, keyed by element id.
    $tray = array();
    $non_pair_tags = array('br', 'img');

    if (!isset($this->job->settings['xliff_validation'])) {
      $this->job->settings['xliff_validation'] = array();
    }
    // NOTE(review): the counter continues from the value already stored for
    // this key, so processing the same key more than once adds up - confirm
    // that this is intended.
    $xliff_validation = $this->job->settings['xliff_validation'];

    /** @var DOMElement $node */
    foreach ($iterator as $node) {
      // Skip the scaffolding added around the source above.
      if (in_array($node->nodeName, array('html', 'body', 'head', 'meta'))) {
        continue;
      }

      if ($node->nodeType === XML_ELEMENT_NODE) {
        // Increment the elements count and compose element id.
        if (!isset($xliff_validation[$key_string])) {
          $xliff_validation[$key_string] = 0;
        }
        $xliff_validation[$key_string]++;
        $id = 'tjiid' . $tjiid . '-' . $xliff_validation[$key_string];

        $is_pair_tag = !in_array($node->nodeName, $non_pair_tags);

        if ($is_pair_tag) {
          $this->writeBPT($writer, $node, $id);
        }
        elseif ($node->nodeName == 'img') {
          $this->writeIMG($writer, $node, $id);
        }
        elseif ($node->nodeName == 'br') {
          $this->writeBR($writer, $node, $id);
        }

        // Add to tray new element info.
        $tray[$id] = array(
          'name' => $node->nodeName,
          'id' => $id,
          'value' => $node->nodeValue,
          'built_text' => '',
          'is_pair_tag' => $is_pair_tag,
        );
      }
      // The current node is a text.
      elseif ($node->nodeName == '#text') {
        // Add the node value to the text output.
        $writer->writeCdata($this->toEntities($node->nodeValue));
        foreach ($tray as &$info) {
          $info['built_text'] .= $node->nodeValue;
        }
        // Break the reference so that later writes to $info or copies of
        // $tray cannot touch the last tray entry by accident.
        unset($info);
      }

      // Reverse so that pair tags are closed in the expected order.
      $reversed_tray = array_reverse($tray);
      foreach ($reversed_tray as $_info) {
        // If the built_text is identical to the node value and it is a pair
        // tag add the end pair tag markup. The comparison has to be strict:
        // a loose one treats numeric strings such as "1." and "1.0" as equal
        // and would close the tag before all of its content has been written.
        if ($_info['value'] === $_info['built_text'] && $_info['is_pair_tag']) {
          // Count also for the closing elements.
          $xliff_validation[$key_string]++;
          $this->writeEPT($writer, $_info['name'], $_info['id']);
          // When the end pair tag has been written unset the element info
          // from the tray.
          unset($tray[$_info['id']]);
        }
      }
    }

    // Set the xliff_validation data and save the job.
    $this->job->settings['xliff_validation'] = $xliff_validation;
    $this->job->save();

    $writer->endElement();

    // Load the output with XMLReader so that we can easily get the inner xml.
    $reader = new XMLReader();
    $reader->XML($writer->outputMemory());
    $reader->read();
    return $reader->readInnerXML();
  }
  /**
   * Writes a <br> tag as an empty <x ctype="lb"> element.
   *
   * @param XMLWriter $writer
   *   Writer that writes the output.
   * @param DOMElement $node
   *   Current node. Not used.
   * @param string $id
   *   Current node id, written as the "id" attribute.
   */
  protected function writeBR(XMLWriter $writer, DOMElement $node, $id) {
    $line_break_attributes = array(
      'id' => $id,
      'ctype' => 'lb',
    );

    $writer->startElement('x');
    foreach ($line_break_attributes as $attribute_name => $attribute_value) {
      $writer->writeAttribute($attribute_name, $attribute_value);
    }
    $writer->endElement();
  }
  /**
   * Writes beginning pair tag.
   *
   * The opening HTML tag, including its attributes, is rebuilt as a string
   * and written as the text content of a <bpt> element.
   *
   * @param XMLWriter $writer
   *   Writer that writes the output.
   * @param DOMElement $node
   *   Current node.
   * @param string $id
   *   Current node id, written as the "id" attribute of the <bpt> element.
   */
  protected function writeBPT(XMLWriter $writer, DOMElement $node, $id) {
    $beginning_tag = '<' . $node->nodeName;
    if ($node->hasAttributes()) {
      $attributes = array();
      /** @var DOMAttr $attribute */
      foreach ($node->attributes as $attribute) {
        // DOM hands out the decoded attribute value. Escape it again so that
        // quotes, ampersands and angle brackets in the value do not break the
        // rebuilt tag.
        $attributes[] = $attribute->name . '="' . htmlspecialchars($attribute->value, ENT_COMPAT, 'UTF-8') . '"';
      }
      $beginning_tag .= ' ' . implode(' ', $attributes);
    }
    $beginning_tag .= '>';

    $writer->startElement('bpt');
    $writer->writeAttribute('id', $id);
    // text() takes care of the XML level escaping of the tag markup.
    $writer->text($beginning_tag);
    $writer->endElement();
  }
  /**
   * Writes the ending pair tag as an <ept> element.
   *
   * @param XMLWriter $writer
   *   Writer that writes the output.
   * @param string $name
   *   Name of the tag to close.
   * @param string $id
   *   Current node id, written as the "id" attribute of the <ept> element.
   */
  protected function writeEPT(XMLWriter $writer, $name, $id) {
    // The closing HTML tag becomes the text content of the element.
    $closing_tag = '</' . $name . '>';

    $writer->startElement('ept');
    $writer->writeAttribute('id', $id);
    $writer->text($closing_tag);
    $writer->endElement();
  }
  /**
   * Writes img tag.
   *
   * The image is written as a <ph ctype="image"> element that carries the
   * attributes of the <img> element as its own attributes.
   *
   * Note that alt and title attributes are not written as sub elements as
   * Trados studio is not able to deal with two sub elements at one level.
   *
   * @param XMLWriter $writer
   *   Writer that writes the output.
   * @param DOMElement $node
   *   Current node.
   * @param string $id
   *   Current node id, written as the "id" attribute of the <ph> element.
   */
  protected function writeIMG(XMLWriter $writer, DOMElement $node, $id) {
    $writer->startElement('ph');
    $writer->writeAttribute('id', $id);
    $writer->writeAttribute('ctype', 'image');

    // @todo - write the title and alt attributes as <sub> elements instead
    //   once the issue with Trados/sub elements is fixed.
    foreach ($node->attributes as $attribute) {
      // "id" and "ctype" are taken by the <ph> element itself. Writing an
      // image attribute of the same name would duplicate the attribute and
      // make the element malformed XML. The import skips both names when
      // rebuilding the <img> tag, so nothing that survives the round trip
      // gets lost here.
      if (in_array($attribute->name, array('id', 'ctype'))) {
        continue;
      }
      $writer->writeAttribute($attribute->name, $attribute->value);
    }

    $writer->endElement();
  }
  /**
   * Converts the characters &, > and < to their HTML entities.
   *
   * DOMDocument converts HTML entities to their actual characters. This can
   * lead to a situation where characters that are not allowed appear in the
   * content.
   *
   * @param string $string
   *   String to escape.
   *
   * @return string
   *   Escaped string.
   */
  protected function toEntities($string) {
    // The ampersand has to stay first so that the entities produced for the
    // angle brackets are not escaped a second time.
    $entities = array(
      '&' => '&amp;',
      '>' => '&gt;',
      '<' => '&lt;',
    );
    return str_replace(array_keys($entities), array_values($entities), $string);
  }
  564. }