<?php

/**
 * @file
 * Converts non-latin text to US-ASCII and sanitizes file names.
 *
 * @author Stefan M. Kudwien (http://drupal.org/user/48898)
 */

/**
 * Implements hook_menu().
 *
 * Registers two local tasks below admin/config/media/file-system: a default
 * "Settings" tab and a "Transliteration" tab that renders the
 * transliteration_retroactive form.
 */
function transliteration_menu() {
  // Default tab. It declares no page callback of its own.
  $settings_tab = array(
    'title' => 'Settings',
    'file path' => drupal_get_path('module', 'system'),
    'weight' => -10,
    'type' => MENU_DEFAULT_LOCAL_TASK,
  );

  // Secondary tab, restricted to users who may administer site configuration.
  // Its form builder is looked up in transliteration.admin.inc.
  $retroactive_tab = array(
    'title' => 'Transliteration',
    'description' => 'Convert existing file names to US-ASCII.',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('transliteration_retroactive'),
    'access arguments' => array('administer site configuration'),
    'file' => 'transliteration.admin.inc',
    'weight' => 10,
    'type' => MENU_LOCAL_TASK,
  );

  return array(
    'admin/config/media/file-system/settings' => $settings_tab,
    'admin/config/media/file-system/transliteration' => $retroactive_tab,
  );
}

/**
 * Implements hook_form_FORM_ID_alter().
 *
 * Extends the file system settings form with a "Transliteration" section that
 * holds three checkboxes: transliterate uploads, transliterate the displayed
 * file name, and lowercase transliterated names. Each checkbox defaults to the
 * stored variable of the same name, falling back to TRUE.
 */
function transliteration_form_system_file_system_settings_alter(&$form, &$form_state) {
  // The two dependent options are hidden while the upload checkbox is
  // unchecked.
  $hidden_unless_enabled = array(
    'invisible' => array(
      'input[name="transliteration_file_uploads"]' => array('checked' => FALSE),
    ),
  );

  $section = array(
    '#type' => 'item',
    '#title' => t('Transliteration'),
    '#value' => '',
  );
  $section['transliteration_file_uploads'] = array(
    '#type' => 'checkbox',
    '#title' => t('Transliterate file names during upload.'),
    '#description' => t('Enable to convert file names to US-ASCII character set for cross-platform compatibility.'),
    '#default_value' => variable_get('transliteration_file_uploads', TRUE),
  );
  $section['transliteration_file_uploads_display_name'] = array(
    '#type' => 'checkbox',
    '#title' => t('Transliterate the displayed file name.'),
    '#description' => t('Enable to also convert the file name that is displayed within the site (for example, in link text).'),
    '#default_value' => variable_get('transliteration_file_uploads_display_name', TRUE),
    '#states' => $hidden_unless_enabled,
  );
  $section['transliteration_file_lowercase'] = array(
    '#type' => 'checkbox',
    '#title' => t('Lowercase transliterated file names.'),
    '#default_value' => variable_get('transliteration_file_lowercase', TRUE),
    '#description' => t('This is a recommended setting to prevent issues with case-insensitive file systems. It has no effect if transliteration has been disabled.'),
    '#states' => $hidden_unless_enabled,
  );

  $form['transliteration'] = $section;

  // NOTE(review): only the 'buttons' element receives a weight here; confirm
  // that this is the element holding the form's submit buttons.
  $form['buttons']['#weight'] = 1;
}

/**
 * Implements hook_form_FORM_ID_alter().
 *
 * Adds a "Transliteration" fieldset to the search settings form. It contains a
 * single checkbox bound to the transliteration_search variable, which defaults
 * to TRUE.
 */
function transliteration_form_search_admin_settings_alter(&$form, &$form_state) {
  $form['transliteration'] = array(
    '#type' => 'fieldset',
    '#title' => t('Transliteration'),
    'transliteration_search' => array(
      '#type' => 'checkbox',
      '#title' => t('Transliterate search index and searched strings.'),
      '#description' => t('Enable to allow searching and indexing using US-ASCII character set, i.e. to treat accented and unaccented letters the same.'),
      '#default_value' => variable_get('transliteration_search', TRUE),
    ),
  );
}

/**
 * Transliterates and sanitizes a file name.
 *
 * A single name is first offered to other modules through the
 * transliteration_clean_filename_prepare alter hook, then transliterated with
 * characters lacking an ASCII equivalent dropped. Afterwards spaces become
 * underscores, every character other than 0-9, A-Z, a-z, "_", "." and "-" is
 * removed, and runs of repeated underscores, dots or hyphens shrink to a
 * single character. The result is lowercased unless the
 * transliteration_file_lowercase variable is disabled.
 *
 * An array of names is processed element by element, recursively, with its
 * keys preserved.
 *
 * @param $filename
 *   A file name, or an array of file names.
 * @param $source_langcode
 *   Optional ISO 639 language code of the input, used to apply
 *   language-specific variations. When the source language is unknown, passing
 *   the site default language is recommended for consistent results;
 *   otherwise the current display language will be used.
 * @return
 *   The sanitized file name, or an array of sanitized file names.
 *
 * @see language_default()
 */
function transliteration_clean_filename($filename, $source_langcode = NULL) {
  if (!is_array($filename)) {
    // Allow other modules to alter the filename prior to processing.
    drupal_alter('transliteration_clean_filename_prepare', $filename, $source_langcode);

    $clean = transliteration_get($filename, '', $source_langcode);
    // Replace whitespace.
    $clean = str_replace(' ', '_', $clean);
    // The patterns run in order: strip remaining unsafe characters first, then
    // collapse repeated separators.
    $clean = preg_replace(
      array('![^0-9A-Za-z_.-]!', '/(_)_+|(\.)\.+|(-)-+/'),
      array('', '\\1\\2\\3'),
      $clean
    );

    // Lowercasing prevents issues on case-insensitive file systems.
    return variable_get('transliteration_file_lowercase', TRUE) ? strtolower($clean) : $clean;
  }

  $cleaned = array();
  foreach ($filename as $index => $name) {
    $cleaned[$index] = transliteration_clean_filename($name, $source_langcode);
  }
  return $cleaned;
}

/**
 * Transliterates text.
 *
 * Accepts input in any language and tries to express it with US-ASCII
 * characters only, conveying in Roman letters the pronunciation of text
 * written in another writing system. The conversion itself is delegated to
 * _transliteration_process().
 *
 * @param $text
 *   UTF-8 encoded text input.
 * @param $unknown
 *   Replacement string for characters that have no suitable ASCII equivalent.
 * @param $source_langcode
 *   Optional ISO 639 language code of the input, used to apply
 *   language-specific variations. When the source language is unknown, passing
 *   the site default language is recommended for consistent results;
 *   otherwise the current display language will be used.
 * @return
 *   Transliterated text.
 *
 * @see language_default()
 */
function transliteration_get($text, $unknown = '?', $source_langcode = NULL) {
  // Only ask for the module's include file when the worker function has not
  // been defined yet.
  if (function_exists('_transliteration_process') === FALSE) {
    module_load_include('inc', 'transliteration');
  }

  $transliterated = _transliteration_process($text, $unknown, $source_langcode);
  return $transliterated;
}

/**
 * Implements hook_init().
 *
 * Sanitizes file names during upload. When files were submitted under the
 * "files" key and the transliteration_file_uploads variable is enabled, every
 * entry of $_FILES['files']['name'] is replaced with its cleaned version and
 * the untouched value is kept in $_FILES['files']['orig_name'] under the same
 * key.
 */
function transliteration_init() {
  if (empty($_FILES['files']) || !variable_get('transliteration_file_uploads', TRUE)) {
    return;
  }
  // The name list comes straight from the request. It is a plain string rather
  // than an array when the upload field is named just "files", and iterating
  // over that would trigger a PHP warning.
  if (!isset($_FILES['files']['name']) || !is_array($_FILES['files']['name'])) {
    return;
  }

  // Figure out language, which is available in $_POST['language'] for node
  // forms. The value is untrusted: an array-valued "language" must not be used
  // as an array offset, which raises a TypeError on PHP 8.
  $langcode = NULL;
  if (!empty($_POST['language']) && is_string($_POST['language'])) {
    $languages = language_list();
    if (isset($languages[$_POST['language']])) {
      $langcode = $_POST['language'];
    }
  }

  foreach ($_FILES['files']['name'] as $field => $filename) {
    // Keep a copy of the unaltered file name.
    $_FILES['files']['orig_name'][$field] = $filename;
    $_FILES['files']['name'][$field] = transliteration_clean_filename($filename, $langcode);
  }
}

/**
 * Implements hook_file_presave().
 *
 * If an uploaded file had its name altered in transliteration_init() and the
 * human-readable display name is not being transliterated, the original name
 * is restored as $file->filename before saving. (The transliterated version
 * will still be used in the file URI, which is the only place where it
 * matters.)
 *
 * @param $file
 *   The file object about to be saved; its filename property may be replaced.
 */
function transliteration_file_presave($file) {
  // Nothing to restore without a list of uploaded names. The list is request
  // data, so make sure it really is an array before searching it.
  if (empty($_FILES['files']['name']) || !is_array($_FILES['files']['name'])) {
    return;
  }
  // Only act when uploads are transliterated but display names are not.
  if (!variable_get('transliteration_file_uploads', TRUE) || variable_get('transliteration_file_uploads_display_name', TRUE)) {
    return;
  }

  // Strict comparison is required: a loose search treats numeric-looking names
  // such as '1000' and '1e3' as equal and could pick another upload's entry.
  $key = array_search($file->filename, $_FILES['files']['name'], TRUE);
  if ($key !== FALSE && isset($_FILES['files']['orig_name'][$key])) {
    $file->filename = $_FILES['files']['orig_name'][$key];
  }
}

/**
 * Implements hook_search_preprocess().
 *
 * Transliterates text added to the search index as well as user submitted
 * search keywords, using the site default language and dropping characters
 * without an ASCII equivalent. The text is returned unchanged when the
 * transliteration_search variable is disabled.
 */
function transliteration_search_preprocess($text) {
  if (!variable_get('transliteration_search', TRUE)) {
    return $text;
  }

  $site_langcode = language_default('language');
  return transliteration_get($text, '', $site_langcode);
}

/**
 * Implements hook_filter_info().
 *
 * Declares a single "transliteration" text filter together with its process,
 * settings and tips callbacks. The placeholder for characters without a known
 * transliteration defaults to "?".
 */
function transliteration_filter_info() {
  $filters = array();

  $filters['transliteration'] = array(
    'title' => t('Convert all characters to US-ASCII'),
    'process callback' => '_transliteration_filter_process',
    'settings callback' => '_transliteration_filter_settings',
    'default settings' => array(
      'no_known_transliteration' => '?',
    ),
    'tips callback' => '_transliteration_filter_tips',
  );

  return $filters;
}

/**
 * Process callback for the transliteration filter.
 *
 * Transliterates the text in the given language, substituting the filter's
 * configured no_known_transliteration placeholder for characters that have no
 * ASCII equivalent. The $format, $cache and $cache_id arguments are not used.
 */
function _transliteration_filter_process($text, $filter, $format, $langcode, $cache, $cache_id) {
  $placeholder = $filter->settings['no_known_transliteration'];

  return transliteration_get($text, $placeholder, $langcode);
}

/**
 * Settings callback for the transliteration filter.
 *
 * Completes the filter's settings with the supplied defaults (values already
 * present on the filter win) and returns a text field for editing the
 * placeholder used when a character has no known US-ASCII equivalent.
 */
function _transliteration_filter_settings($form, &$form_state, $filter, $format, $defaults, $filters) {
  // Array union keeps existing keys and only fills in the missing ones.
  $filter->settings = $filter->settings + $defaults;

  $placeholder_field = array(
    '#type' => 'textfield',
    '#title' => t('Placeholder for characters with no known US-ASCII equivalent'),
    '#size' => 2,
    // The maximum length is 5 in order to accommodate unicode multibyte input.
    '#maxlength' => 5,
    '#default_value' => $filter->settings['no_known_transliteration'],
  );

  return array('no_known_transliteration' => $placeholder_field);
}

/**
 * Filter tips callback for the transliteration filter.
 *
 * Returns the same translated one-line tip regardless of the filter, format or
 * the $long flag.
 */
function _transliteration_filter_tips($filter, $format, $long) {
  $tip = t('Non-latin text (e.g., å, ö, 漢) will be converted to US-ASCII equivalents (a, o, ?).');

  return $tip;
}