processor_tokenizer.inc 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. <?php
  2. /**
  3. * @file
  4. * Contains SearchApiTokenizer.
  5. */
  /**
   * Processor for tokenizing fulltext data by replacing (configurable)
   * non-letters with spaces.
   *
   * Two options drive the processor, both entered as PCRE pattern fragments
   * (typically a character class):
   * - spaces: characters regarded as whitespace, i.e. word delimiters.
   * - ignorable: characters removed from the text altogether.
   *
   * Every regular expression built from these options has the form
   * '/(' . $pattern . ')+/u', which is also the form checked by
   * configurationFormValidate().
   */
  class SearchApiTokenizer extends SearchApiAbstractProcessor {

    /**
     * The "spaces" option with "/" escaped for use inside a "/"-delimited regex.
     *
     * Stays unset until prepare() has run.
     *
     * @var string
     */
    protected $spaces;

    /**
     * The "ignorable" option with "/" escaped for use inside a "/"-delimited
     * regex.
     *
     * Stays unset until prepare() has run.
     *
     * @var string
     */
    protected $ignorable;

    /**
     * Builds the configuration form for this processor.
     *
     * Extends the parent form: restricts the field options to text-type fields
     * and adds the "spaces" and "ignorable" textfields.
     *
     * @return array
     *   The form array.
     */
    public function configurationForm() {
      $form = parent::configurationForm();

      // Only make fulltext fields available as options.
      $fields = $this->index->getFields();
      $field_options = array();
      foreach ($fields as $name => $field) {
        if (empty($field['real_type']) && search_api_is_text_type($field['type'])) {
          $field_options[$name] = $field['name'];
        }
      }
      $form['fields']['#options'] = $field_options;

      $form += array(
        'spaces' => array(
          '#type' => 'textfield',
          '#title' => t('Whitespace characters'),
          '#description' => t('Specify the characters that should be regarded as whitespace and therefore used as word-delimiters. ' .
            'Specify the characters as a <a href="@link">PCRE character class</a>. ' .
            'Note: For non-English content, the default setting might not be suitable.',
            array('@link' => url('http://www.php.net/manual/en/regexp.reference.character-classes.php'))),
          '#default_value' => "[^[:alnum:]]",
        ),
        'ignorable' => array(
          '#type' => 'textfield',
          '#title' => t('Ignorable characters'),
          '#description' => t('Specify characters which should be removed from fulltext fields and search strings (e.g., "-"). The same format as above is used.'),
          '#default_value' => "[']",
        ),
      );

      // Override each default only when that option has actually been saved.
      // Checking the keys individually avoids an undefined-index notice (and a
      // NULL default) when the options array exists but lacks one of them.
      foreach (array('spaces', 'ignorable') as $key) {
        if (isset($this->options[$key])) {
          $form[$key]['#default_value'] = $this->options[$key];
        }
      }

      return $form;
    }

    /**
     * Validates the submitted configuration.
     *
     * After running the parent validation, checks that both the "spaces" and
     * the "ignorable" value compile as a regular expression when embedded the
     * same way the processing methods embed them.
     *
     * @param array $form
     *   The configuration form, as returned by configurationForm().
     * @param array $values
     *   The submitted values for this form.
     * @param array $form_state
     *   The complete form state.
     */
    public function configurationFormValidate(array $form, array &$values, array &$form_state) {
      parent::configurationFormValidate($form, $values, $form_state);

      foreach (array('spaces', 'ignorable') as $key) {
        $pattern = isset($values[$key]) ? $values[$key] : '';
        // Escape the delimiter exactly as prepare() does, so the expression
        // tested here is the one used at processing time.
        $pattern = str_replace('/', '\/', $pattern);
        // preg_match() returns FALSE (and emits a warning, silenced here) when
        // the pattern does not compile.
        if (@preg_match('/(' . $pattern . ')+/u', '') === FALSE) {
          $el = $form[$key];
          form_error($el, $el['#title'] . ': ' . t('The entered text is no valid regular expression.'));
        }
      }
    }

    /**
     * Tokenizes a single field value in place.
     *
     * Ignorable characters are removed first. The value is then split on runs
     * of whitespace characters; if that yields more than one part, $value is
     * replaced by a list of array('value' => $token) entries. Otherwise it
     * remains a string.
     *
     * @param mixed $value
     *   The value to process, passed by reference.
     */
    protected function processFieldValue(&$value) {
      $this->prepare();
      if ($this->ignorable) {
        $stripped = preg_replace('/(' . $this->ignorable . ')+/u', '', $value);
        // preg_replace() returns NULL on failure (e.g. malformed UTF-8 in the
        // subject); keep the original value rather than wiping it.
        if ($stripped !== NULL) {
          $value = $stripped;
        }
      }
      if ($this->spaces) {
        $arr = preg_split('/(' . $this->spaces . ')+/u', $value);
        // preg_split() returns FALSE on failure; leave $value untouched then.
        if (is_array($arr) && count($arr) > 1) {
          $value = array();
          foreach ($arr as $token) {
            $value[] = array('value' => $token);
          }
        }
      }
    }

    /**
     * Normalizes a string value in place without splitting it into tokens.
     *
     * Ignorable characters are removed and every run of whitespace characters
     * is collapsed into a single space. The patterns are grouped exactly as in
     * configurationFormValidate() and processFieldValue(), so a configured
     * value containing alternation or several atoms behaves the same in all
     * three places.
     *
     * @param mixed $value
     *   The value to process, passed by reference. Non-strings are left as-is.
     */
    protected function process(&$value) {
      // We don't touch integers, NULL values or the like.
      if (!is_string($value)) {
        return;
      }
      $this->prepare();
      if ($this->ignorable) {
        $stripped = preg_replace('/(' . $this->ignorable . ')+/u', '', $value);
        // NULL signals a PCRE failure; keep the previous value in that case.
        if ($stripped !== NULL) {
          $value = $stripped;
        }
      }
      if ($this->spaces) {
        $collapsed = preg_replace('/(' . $this->spaces . ')+/u', ' ', $value);
        if ($collapsed !== NULL) {
          $value = $collapsed;
        }
      }
    }

    /**
     * Lazily derives the regex-ready patterns from the configured options.
     *
     * Runs its body only while $this->spaces is unset. "/" is escaped because
     * it is the delimiter of every expression built in this class. A missing
     * option is treated as an empty string, which disables the corresponding
     * processing step.
     */
    protected function prepare() {
      if (!isset($this->spaces)) {
        $spaces = isset($this->options['spaces']) ? $this->options['spaces'] : '';
        $ignorable = isset($this->options['ignorable']) ? $this->options['ignorable'] : '';
        $this->spaces = str_replace('/', '\/', $spaces);
        $this->ignorable = str_replace('/', '\/', $ignorable);
      }
    }

  }