123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635 |
- <?php
- /**
- * Export to XLIFF format.
- *
- * The XLIFF processor follows this specification:
- * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html
- *
- * The purpose of this class is to mask or process HTML elements in the source
- * and target elements so that translation tools are able to understand which
- * content needs to be translated and ignored.
- *
- * On the other hand we need to properly unmask the XLIFF markup back to HTML on
- * the translation import. So the process is bidirectional and prior to running
- * the unmasking process we try to validate the integrity in the
- * validateImport() method. Currently the integrity check involves only a
- * counter of the XLIFF elements created during source processing, which has
- * to match the number of XLIFF elements being imported with the
- * translation.
- *
- * To process the content a DOMDocument object is used due to its ability to
- * read broken HTML. This also implies that if the source content contains
- * broken HTML, the translated content will be fixed to the extent of
- * DOMDocument's abilities.
- *
- * Following is implemented:
- * - All pair tags get escaped using <bpt><ept> markup.
- * - <br> tags are marked with <x ctype="lb">.
- * - <img> tags are marked with <ph ctype="image"> tags. The title and alt
- * attributes should have been extracted into <sub> elements, however are not
- * as Trados studio triggers a fatal error in case there are two <sub>
- * elements at the same level.
- *
- * Not implemented:
- * - Attributes of <img> element are written only as attributes of <ph> element
- * instead of using x-html: prefix. This results in conflict with own <ph>
- * element's attributes such as "id". The reason why x-html prefix has not
- * been used is that Trados studio triggered fatal error on xml validation.
- * - Translatable attributes like title and alt.
- * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#elem_img
- * - Forms - this is big part
- * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#HTMLForms
- * - <pre> elements
- * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#Elem_preformatted
- */
- class TMGMTFileformatXLIFF extends XMLWriter implements TMGMTFileFormatInterface {
- /**
- * Contains a reference to the job currently being exported.
- *
- * Assigned in export() and read by the helpers that write trans-units.
- *
- * @var TMGMTJob
- */
- protected $job;
- protected $importedXML; // Parsed import document (result of simplexml_load_string(), FALSE on failure); set by getImportedXML().
- protected $importedTransUnits; // Imported targets keyed by trans-unit id, each with a '#text' entry; filled by getImportedTargets().
/**
 * Writes one job item into the export as a <group> element.
 *
 * The group carries the job item id, a <note> with the source label and one
 * <trans-unit> for every data element that remains after the item data has
 * been flattened and filtered.
 *
 * @param TMGMTJobItem $item
 *   The job item entity to export.
 */
protected function addItem(TMGMTJobItem $item) {
  $item_id = $item->tjiid;

  $this->startElement('group');
  $this->writeAttribute('id', $item_id);
  // Expose the source label to the translator as a note.
  $this->writeElement('note', $item->getSourceLabel());

  // @todo: Write in nested groups instead of flattening it.
  $flattened = tmgmt_flatten_data($item->getData());
  $translatable = array_filter($flattened, '_tmgmt_filter_data');
  foreach ($translatable as $data_key => $data_element) {
    // The unit id is the job item id joined with the flattened data key.
    $unit_id = $item_id . '][' . $data_key;
    $this->addTransUnit($unit_id, $data_element, $this->job);
  }

  // Close the group.
  $this->endElement();
}
/**
 * Adds a single translation unit for a data element.
 *
 * Writes a <trans-unit> holding a <source>, a <target> (filled only when a
 * translation already exists) and, if the element has a label, a <note>.
 *
 * @param string $key
 *   The unique identifier for this data element. Used for both the "id" and
 *   the "resname" attribute.
 * @param array $element
 *   Array with the property #text and optionally #label and
 *   #translation/#text.
 * @param TMGMTJob $job
 *   Translation job whose settings (xliff_cdata, xliff_processing) decide
 *   how the text is written.
 */
protected function addTransUnit($key, $element, TMGMTJob $job) {
  $key_array = tmgmt_ensure_keys_array($key);

  $this->startElement('trans-unit');
  $this->writeAttribute('id', $key);
  $this->writeAttribute('resname', $key);

  $this->startElement('source');
  $this->writeAttribute('xml:lang', $this->job->getTranslator()->mapToRemoteLanguage($this->job->source_language));
  if ($job->getSetting('xliff_cdata')) {
    // writeCdata() emits its argument verbatim, so a literal "]]>" in the
    // text would close the CDATA section too early and produce malformed
    // XML. Split such a sequence across two adjacent CDATA sections.
    $this->writeCdata(str_replace(']]>', ']]]]><![CDATA[>', trim($element['#text'])));
  }
  elseif ($job->getSetting('xliff_processing')) {
    // processForExport() returns ready-made XLIFF markup.
    $this->writeRaw($this->processForExport($element['#text'], $key_array));
  }
  else {
    $this->text($element['#text']);
  }
  $this->endElement();

  $this->startElement('target');
  $this->writeAttribute('xml:lang', $this->job->getTranslator()->mapToRemoteLanguage($this->job->target_language));
  if (!empty($element['#translation']['#text'])) {
    if ($job->getSetting('xliff_processing')) {
      $this->writeRaw($this->processForExport($element['#translation']['#text'], $key_array));
    }
    else {
      $this->text($element['#translation']['#text']);
    }
  }
  $this->endElement();

  if (isset($element['#label'])) {
    $this->writeElement('note', $element['#label']);
  }

  // Close the trans-unit.
  $this->endElement();
}
/**
 * {@inheritdoc}
 *
 * Builds an XLIFF 1.2 document in memory: an <xliff> root, one <file>
 * element whose <header> records the job id in an "extraction" phase, and a
 * <body> containing one <group> per job item.
 *
 * @param TMGMTJob $job
 *   The job to export. It is also stored in $this->job for the helper
 *   methods.
 * @param array $conditions
 *   (optional) Conditions passed on to $job->getItems().
 *
 * @return string
 *   The generated XML.
 */
public function export(TMGMTJob $job, $conditions = array()) {
  $this->job = $job;

  $this->openMemory();
  $this->setIndent(true);
  $this->setIndentString(' ');
  $this->startDocument('1.0', 'UTF-8');

  // Root element with schema definition.
  $this->startElement('xliff');
  $this->writeAttribute('version', '1.2');
  $this->writeAttribute('xmlns', 'urn:oasis:names:tc:xliff:document:1.2');
  $this->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
  $this->writeAttribute('xsi:schemaLocation', 'urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd');

  // File element.
  $this->startElement('file');
  $this->writeAttribute('original', 'xliff-core-1.2-strict.xsd');
  $this->writeAttribute('source-language', $job->getTranslator()->mapToRemoteLanguage($job->source_language));
  $this->writeAttribute('target-language', $job->getTranslator()->mapToRemoteLanguage($job->target_language));
  $this->writeAttribute('datatype', 'plaintext');
  // Date needs to be in ISO-8601 UTC. gmdate() yields UTC regardless of the
  // configured timezone; H = 24-hour clock, i = minutes, s = seconds.
  $this->writeAttribute('date', gmdate('Y-m-d\TH:i:s\Z'));

  // Header: the phase carries the job id that the import relies on.
  $this->startElement('header');
  $this->startElement('phase-group');
  $this->startElement('phase');
  $this->writeAttribute('tool-id', 'tmgmt');
  $this->writeAttribute('phase-name', 'extraction');
  $this->writeAttribute('process-name', 'extraction');
  $this->writeAttribute('job-id', $job->tjid);
  // End the phase and phase-group tags.
  $this->endElement();
  $this->endElement();
  $this->startElement('tool');
  $this->writeAttribute('tool-id', 'tmgmt');
  $this->writeAttribute('tool-name', 'Drupal Translation Management Tools');
  // End the tool and header tags.
  $this->endElement();
  $this->endElement();

  $this->startElement('body');
  foreach ($job->getItems($conditions) as $item) {
    $this->addItem($item);
  }

  // End the body, file and xliff tags.
  $this->endElement();
  $this->endElement();
  $this->endElement();
  $this->endDocument();

  return $this->outputMemory();
}
/**
 * {@inheritdoc}
 *
 * Parses the XLIFF input, resolves the job from the job-id attribute of the
 * "extraction" phase and returns the imported targets as unflattened data.
 *
 * @param string $imported_file
 *   Path to a file or an XML string to import.
 * @param bool $is_file
 *   (optional) Whether $imported_file is the path to a file or not.
 *
 * @return array|false
 *   The unflattened translation data, or FALSE if the input cannot be
 *   parsed, carries no extraction phase or references a job that cannot be
 *   loaded.
 */
public function import($imported_file, $is_file = TRUE) {
  if (!$this->getImportedXML($imported_file, $is_file)) {
    return FALSE;
  }

  // Without the phase element there is no job reference to work with.
  $phases = $this->importedXML->xpath("//xliff:phase[@phase-name='extraction']");
  if (empty($phases)) {
    return FALSE;
  }
  $phase = reset($phases);

  // getImportedTargets() is type hinted, so bail out before handing it
  // anything that is not a job.
  $job = tmgmt_job_load((string) $phase['job-id']);
  if (empty($job)) {
    return FALSE;
  }

  // getImportedTargets() yields no array when the file has no trans-units;
  // normalise that to an empty list before unflattening.
  $targets = $this->getImportedTargets($job);
  if (empty($targets)) {
    $targets = array();
  }
  return tmgmt_unflatten_data($targets);
}
/**
 * {@inheritdoc}
 *
 * Validates an imported XLIFF file. Checks:
 * - that the file parses as XML and carries the extraction phase with a
 *   job id that can be loaded,
 * - that the source and target languages match the job,
 * - that at least one translation is present,
 * - if XLIFF processing is enabled for the job, that the number of elements
 *   in every imported target equals the number recorded in the job's
 *   xliff_validation setting. A mismatch only adds a message; it does not
 *   fail the validation.
 *
 * @param string $imported_file
 *   Path to the file to validate.
 *
 * @return TMGMTJob|false
 *   The job the file belongs to, or FALSE if the validation failed.
 */
public function validateImport($imported_file) {
  if (!($xml = $this->getImportedXML($imported_file))) {
    drupal_set_message(t('The imported file is not a valid XML.'), 'error');
    return FALSE;
  }

  // Check if our phase information is there.
  $phase = $xml->xpath("//xliff:phase[@phase-name='extraction']");
  if ($phase) {
    $phase = reset($phase);
  }
  else {
    drupal_set_message(t('The imported file is missing required XLIFF phase information.'), 'error');
    return FALSE;
  }

  // Check if the job has a valid job reference.
  if (!isset($phase['job-id'])) {
    drupal_set_message(t('The imported file does not contain a job reference.'), 'error');
    return FALSE;
  }

  // Attempt to load the job referenced by the file.
  $job = tmgmt_job_load((int) $phase['job-id']);
  if (empty($job)) {
    drupal_set_message(t('The imported file job id @file_tjid is not available.', array(
      '@file_tjid' => $phase['job-id'],
    )), 'error');
    return FALSE;
  }

  // @todo We use the $job to addMessage in case of failure. However the job
  // context is not safe at this point.

  // Compare source language.
  if (!isset($xml->file['source-language']) || $job->getTranslator()->mapToRemoteLanguage($job->source_language) != $xml->file['source-language']) {
    $job->addMessage('The imported file source language @file_language does not match the job source language @job_language.', array(
      '@file_language' => empty($xml->file['source-language']) ? t('none') : $xml->file['source-language'],
      '@job_language' => $job->source_language,
    ), 'error');
    return FALSE;
  }

  // Compare target language.
  if (!isset($xml->file['target-language']) || $job->getTranslator()->mapToRemoteLanguage($job->target_language) != $xml->file['target-language']) {
    $job->addMessage('The imported file target language @file_language does not match the job target language @job_language.', array(
      '@file_language' => empty($xml->file['target-language']) ? t('none') : $xml->file['target-language'],
      '@job_language' => $job->target_language,
    ), 'error');
    return FALSE;
  }

  $targets = $this->getImportedTargets($job);
  if (empty($targets)) {
    // The message type is the third argument; the second one is the list of
    // placeholder replacements (see the language checks above).
    $job->addMessage('The imported file seems to be missing translation.', array(), 'error');
    return FALSE;
  }

  // In case we do not do xliff processing we cannot do the elements
  // count validation.
  if (!$job->getSetting('xliff_processing')) {
    return $job;
  }

  $reader = new XMLReader();
  $xliff_validation = $job->getSetting('xliff_validation');
  foreach ($targets as $id => $target) {
    // The first part of the key is the job item id, the rest is the data
    // key inside that item.
    $array_key = tmgmt_ensure_keys_array($id);
    $job_item = tmgmt_job_item_load(array_shift($array_key));

    // Count every node of the imported text except the wrapper and plain
    // text nodes.
    $count = 0;
    $reader->XML('<translation>' . $target['#text'] . '</translation>');
    while ($reader->read()) {
      if (in_array($reader->name, array('translation', '#text'), TRUE)) {
        continue;
      }
      $count++;
    }

    if (!isset($xliff_validation[$id]) || $xliff_validation[$id] != $count) {
      $message = 'Failed to validate semantic integrity of %key element. Please check also the HTML code of the element in the review process.';
      $variables = array('%key' => tmgmt_ensure_keys_string($array_key));
      // Fall back to the job if the referenced job item could not be
      // loaded, instead of calling a method on a non-object.
      if ($job_item) {
        $job_item->addMessage($message, $variables);
      }
      else {
        $job->addMessage($message, $variables);
      }
    }
  }

  // Validation successful.
  return $job;
}
/**
 * Returns the parsed import document, parsing it on first use.
 *
 * The parsed document is kept in $this->importedXML, so later calls return
 * it without reading $imported_file again.
 *
 * @param string $imported_file
 *   Path to a file or an XML string to import.
 * @param bool $is_file
 *   (optional) Whether $imported_file is the path to a file or not.
 *
 * @return bool|\SimpleXMLElement
 *   The parsed SimpleXMLElement object. FALSE in case of failed parsing.
 */
protected function getImportedXML($imported_file, $is_file = TRUE) {
  // Reuse the document parsed by an earlier call.
  if (!empty($this->importedXML)) {
    return $this->importedXML;
  }

  // It is not possible to load the file directly with simplexml as it gets
  // url encoded due to the temporary://. This is a PHP bug, see
  // https://bugs.php.net/bug.php?id=61469
  $xml_string = $is_file ? file_get_contents($imported_file) : $imported_file;

  $this->importedXML = simplexml_load_string($xml_string);
  if (!$this->importedXML) {
    return FALSE;
  }

  // Register the XLIFF namespace, required for xpath.
  $this->importedXML->registerXPathNamespace('xliff', 'urn:oasis:names:tc:xliff:document:1.2');
  return $this->importedXML;
}
/**
 * Collects the target texts of all trans-units of the imported document.
 *
 * The list is built on first use and kept in $this->importedTransUnits. If
 * XLIFF processing is enabled for the job, the inner XML of every target is
 * run through processForImport(); otherwise the plain string value of the
 * target is used.
 *
 * @param TMGMTJob $job
 *   Translation job whose xliff_processing setting is consulted.
 *
 * @return array|false|null
 *   Array keyed by trans-unit id, each entry holding the text under '#text'.
 *   FALSE if no document has been loaded yet; NULL if the document contains
 *   no trans-unit.
 */
protected function getImportedTargets(TMGMTJob $job) {
  if (empty($this->importedXML)) {
    return FALSE;
  }
  // Return the list built by an earlier call.
  if (!empty($this->importedTransUnits)) {
    return $this->importedTransUnits;
  }

  $reader = new XMLReader();
  $units = $this->importedXML->xpath('//xliff:trans-unit');
  foreach ($units as $unit) {
    if ($job->getSetting('xliff_processing')) {
      // Position the reader on the <target> element to get its inner XML.
      $reader->XML($unit->target->asXML());
      $reader->read();
      $text = $this->processForImport($reader->readInnerXML(), $job);
    }
    else {
      $text = (string) $unit->target;
    }
    $unit_id = (string) $unit['id'];
    $this->importedTransUnits[$unit_id]['#text'] = $text;
  }

  return $this->importedTransUnits;
}
/**
 * Processes trans-unit/target to rebuild back the HTML.
 *
 * Text and CDATA nodes are appended as they are (this includes the content
 * of <bpt>/<ept> elements, which holds the original tags),
 * <x ctype="lb"> becomes <br /> and <ph ctype="image"> becomes <img /> with
 * all attributes except "ctype" and "id".
 *
 * @param string $translation
 *   Inner XML of the imported target element.
 * @param TMGMTJob $job
 *   Translation job.
 *
 * @return string
 *   The rebuilt HTML, or $translation unchanged if XLIFF processing is
 *   disabled for the job.
 */
protected function processForImport($translation, TMGMTJob $job) {
  // In case we do not want to do xliff processing return the translation as
  // is.
  if (!$job->getSetting('xliff_processing')) {
    return $translation;
  }

  $reader = new XMLReader();
  $reader->XML('<translation>' . $translation . '</translation>');
  $text = '';

  while ($reader->read()) {
    // If the current element is text append it to the result text.
    if ($reader->name == '#text' || $reader->name == '#cdata-section') {
      $text .= $reader->value;
    }
    elseif ($reader->name == 'x') {
      if ($reader->getAttribute('ctype') == 'lb') {
        $text .= '<br />';
      }
    }
    elseif ($reader->name == 'ph') {
      if ($reader->getAttribute('ctype') == 'image') {
        $text .= '<img';
        while ($reader->moveToNextAttribute()) {
          // @todo - we have to use x-html: prefixes for attributes.
          if ($reader->name != 'ctype' && $reader->name != 'id') {
            // The reader hands out the decoded attribute value. Encode it
            // again, otherwise a quote or ampersand in e.g. the alt text
            // breaks the generated HTML.
            $value = htmlspecialchars($reader->value, ENT_COMPAT, 'UTF-8');
            $text .= ' ' . $reader->name . '="' . $value . '"';
          }
        }
        $text .= ' />';
      }
    }
  }

  return $text;
}
/**
 * Helper function to process the source text.
 *
 * Parses the text as HTML and rewrites it as XLIFF inline markup: paired
 * tags become <bpt>/<ept>, <br> becomes <x> and <img> becomes <ph>; text is
 * written as CDATA. Every written element increments the counter stored
 * under the data key in the job's xliff_validation setting, and the job is
 * saved at the end of each call.
 *
 * NOTE(review): the counter continues from the value already stored in the
 * job settings, so it appears to grow when the same key is processed more
 * than once (source and target, or repeated exports) - confirm whether that
 * is intended.
 *
 * @param string $source
 *   The HTML text to process.
 * @param array $key_array
 *   The source item data key. The first element is used as the job item id
 *   in the generated element ids.
 *
 * @return string
 *   The XLIFF markup for the text.
 */
protected function processForExport($source, array $key_array) {
  $tjiid = $key_array[0];
  $key_string = tmgmt_ensure_keys_string($key_array);

  // The reason why we use DOMDocument object here and not just XMLReader
  // is the DOMDocument's ability to deal with broken HTML.
  $dom = new DOMDocument();
  // We need to append the head with encoding so that special characters
  // are read correctly.
  $dom->loadHTML("<html><head><meta http-equiv='Content-type' content='text/html; charset=UTF-8' /></head><body>" . $source . '</body></html>');

  $iterator = new RecursiveIteratorIterator(
    new RecursiveDOMIterator($dom),
    RecursiveIteratorIterator::SELF_FIRST);

  $writer = new XMLWriter();
  $writer->openMemory();
  $writer->startDocument('1.0', 'UTF-8');
  $writer->startElement('wrapper');

  // Elements that have been opened but not yet closed, keyed by element id.
  $tray = array();
  $non_pair_tags = array('br', 'img');

  if (!isset($this->job->settings['xliff_validation'])) {
    $this->job->settings['xliff_validation'] = array();
  }
  $xliff_validation = $this->job->settings['xliff_validation'];

  /** @var DOMElement $node */
  foreach ($iterator as $node) {
    // Skip the scaffolding added around the text above.
    if (in_array($node->nodeName, array('html', 'body', 'head', 'meta'))) {
      continue;
    }

    if ($node->nodeType === XML_ELEMENT_NODE) {
      // Increment the elements count and compose element id.
      if (!isset($xliff_validation[$key_string])) {
        $xliff_validation[$key_string] = 0;
      }
      $xliff_validation[$key_string]++;
      $id = 'tjiid' . $tjiid . '-' . $xliff_validation[$key_string];

      $is_pair_tag = !in_array($node->nodeName, $non_pair_tags);

      if ($is_pair_tag) {
        $this->writeBPT($writer, $node, $id);
      }
      elseif ($node->nodeName == 'img') {
        $this->writeIMG($writer, $node, $id);
      }
      elseif ($node->nodeName == 'br') {
        $this->writeBR($writer, $node, $id);
      }

      // Add to tray new element info. The value is cast to a string so it
      // can be compared strictly with the collected text below.
      $tray[$id] = array(
        'name' => $node->nodeName,
        'id' => $id,
        'value' => (string) $node->nodeValue,
        'built_text' => '',
        'is_pair_tag' => $is_pair_tag,
      );
    }
    // The current node is a text.
    elseif ($node->nodeName == '#text') {
      // Add the node value to the text output.
      $writer->writeCdata($this->toEntities($node->nodeValue));
      foreach ($tray as &$info) {
        $info['built_text'] .= $node->nodeValue;
      }
      // Break the reference so later code cannot write into the tray by
      // accident.
      unset($info);
    }

    // Reverse so that pair tags are closed in the expected order.
    $reversed_tray = array_reverse($tray);
    foreach ($reversed_tray as $_info) {
      // If the built_text equals the node value and it is a pair tag, add
      // the end pair tag markup. The comparison is strict: with a loose one
      // numeric strings such as '10' and '10.0' are considered equal and the
      // tag would be closed too early.
      if ($_info['value'] === $_info['built_text'] && $_info['is_pair_tag']) {
        // Count also for the closing elements.
        $xliff_validation[$key_string]++;
        $this->writeEPT($writer, $_info['name'], $_info['id']);
        // When the end pair tag has been written unset the element info
        // from the tray.
        unset($tray[$_info['id']]);
      }
    }
  }

  // Set the xliff_validation data and save the job.
  $this->job->settings['xliff_validation'] = $xliff_validation;
  $this->job->save();

  // Close the wrapper element.
  $writer->endElement();

  // Load the output with XMLReader so that we can easily get the inner xml.
  $reader = new XMLReader();
  $reader->XML($writer->outputMemory());
  $reader->read();
  return $reader->readInnerXML();
}
/**
 * Writes a line break as an empty <x ctype="lb"> element.
 *
 * @param XMLWriter $writer
 *   Writer that writes the output.
 * @param DOMElement $node
 *   Current node. Not used, the element carries no data of its own.
 * @param $id
 *   Current node id, written as the "id" attribute.
 */
protected function writeBR(XMLWriter $writer, DOMElement $node, $id) {
  $attributes = array(
    'id' => $id,
    'ctype' => 'lb',
  );

  $writer->startElement('x');
  foreach ($attributes as $attribute_name => $attribute_value) {
    $writer->writeAttribute($attribute_name, $attribute_value);
  }
  $writer->endElement();
}
/**
 * Writes beginning pair tag.
 *
 * The opening tag of the node, including its attributes, is rebuilt as a
 * string and written as the text content of a <bpt> element.
 *
 * @param XMLWriter $writer
 *   Writer that writes the output.
 * @param DOMElement $node
 *   Current node.
 * @param $id
 *   Current node id, written as the "id" attribute.
 */
protected function writeBPT(XMLWriter $writer, DOMElement $node, $id) {
  $beginning_tag = '<' . $node->nodeName;
  if ($node->hasAttributes()) {
    $attributes = array();
    /** @var DOMAttr $attribute */
    foreach ($node->attributes as $attribute) {
      // DOM hands out the decoded attribute value. Encode it again so that
      // quotes and ampersands (e.g. in URLs) do not break the rebuilt tag.
      $attributes[] = $attribute->name . '="' . htmlspecialchars($attribute->value, ENT_COMPAT, 'UTF-8') . '"';
    }
    $beginning_tag .= ' ' . implode(' ', $attributes);
  }
  $beginning_tag .= '>';

  $writer->startElement('bpt');
  $writer->writeAttribute('id', $id);
  // text() takes care of escaping the tag for XML.
  $writer->text($beginning_tag);
  $writer->endElement();
}
/**
 * Writes ending pair tag.
 *
 * The closing tag is written as the text content of an <ept> element.
 *
 * @param XMLWriter $writer
 *   Writer that writes the output.
 * @param string $name
 *   Ending tag name.
 * @param $id
 *   Current node id, written as the "id" attribute.
 */
protected function writeEPT(XMLWriter $writer, $name, $id) {
  $closing_tag = '</' . $name . '>';

  $writer->startElement('ept');
  $writer->writeAttribute('id', $id);
  $writer->text($closing_tag);
  $writer->endElement();
}
/**
 * Writes img tag.
 *
 * The image is written as an empty <ph ctype="image"> element that carries
 * the attributes of the <img> element.
 *
 * Note that alt and title attributes are not written as sub elements as
 * Trados studio is not able to deal with two sub elements at one level.
 *
 * @todo Write alt and title as <sub> elements (ctype x-img-alt and
 *   x-img-title) once the issue with Trados and sub elements is fixed.
 *
 * @param XMLWriter $writer
 *   Writer that writes the output.
 * @param DOMElement $node
 *   Current node.
 * @param $id
 *   Current node id, written as the "id" attribute.
 */
protected function writeIMG(XMLWriter $writer, DOMElement $node, $id) {
  $writer->startElement('ph');
  $writer->writeAttribute('id', $id);
  $writer->writeAttribute('ctype', 'image');

  // These names are taken by the <ph> element itself. Writing an image
  // attribute of the same name would result in a duplicate attribute, which
  // is not well-formed XML. processForImport() ignores both anyway.
  $reserved = array('id', 'ctype');
  foreach ($node->attributes as $attribute) {
    if (in_array($attribute->name, $reserved, TRUE)) {
      continue;
    }
    $writer->writeAttribute($attribute->name, $attribute->value);
  }

  $writer->endElement();
}
/**
 * Convert critical characters to HTML entities.
 *
 * DOMDocument will convert HTML entities to its actual characters. This can
 * lead into situation when not allowed characters will appear in the content.
 * Only "&", ">" and "<" are converted; quotes are left untouched.
 *
 * @param string $string
 *   String to escape.
 *
 * @return string
 *   Escaped string.
 */
protected function toEntities($string) {
  // The ampersand has to come first, otherwise the ampersands of the
  // entities produced for ">" and "<" would be encoded again.
  return str_replace(array('&', '>', '<'), array('&amp;', '&gt;', '&lt;'), $string);
}
- }
|