<?php

/**
 * @file
 * Provides the base class for FeedsXPathParserHTML and FeedsXPathParserXML.
 */
/**
 * Base class for the HTML and XML parsers.
 *
 * Implements the XPath-driven parse loop, the source and importer
 * configuration forms and their validation. Concrete parsers only have to
 * provide setup() and getRaw().
 */
abstract class FeedsXPathParserBase extends FeedsParser {

  /**
   * Not read or written anywhere in this class.
   *
   * Presumably retained for subclasses; confirm before removing.
   *
   * @var array
   */
  protected $modified_queries = array();

  /**
   * Source keys whose query results are produced by getRaw().
   *
   * Populated at the start of parse() from the 'rawXML' configuration.
   *
   * @var array
   */
  protected $rawXML = array();

  /**
   * The document returned by setup(); only set while parse() is running.
   *
   * @var DOMDocument|null
   */
  protected $doc = NULL;

  /**
   * The XPath helper wrapping $doc; only set while parse() is running.
   *
   * @var FeedsXPathParserDOMXPath|null
   */
  protected $xpath = NULL;

  /**
   * Classes that use FeedsXPathParserBase must implement this.
   *
   * Called once at the start of parse(); the returned document is what every
   * XPath query of that run is evaluated against.
   *
   * @param array $source_config
   *   The configuration for the source.
   * @param FeedsFetcherResult $fetcher_result
   *   A FeedsFetcherResult object.
   *
   * @return DOMDocument
   *   The DOMDocument to perform XPath queries on.
   */
  abstract protected function setup($source_config, FeedsFetcherResult $fetcher_result);

  /**
   * Implements FeedsParser::parse().
   *
   * Runs the context query, then runs every configured source query against
   * each context node. The result of an earlier query is available to the
   * later queries of the same item as "$<target>".
   */
  public function parse(FeedsSource $source, FeedsFetcherResult $fetcher_result) {
    $source_config = $source->getConfigFor($this);
    // Fall back to the importer-level configuration when the source has none.
    if (empty($source_config)) {
      $source_config = $this->getConfig();
    }

    $this->doc = $this->setup($source_config, $fetcher_result);
    $this->rawXML = array_keys(array_filter($source_config['rawXML']));
    $mappings = $this->getOwnMappings();

    $parser_result = new FeedsParserResult();
    // Set link.
    $fetcher_config = $source->getConfigFor($source->importer->fetcher);
    $parser_result->link = $fetcher_config['source'];

    $this->xpath = new FeedsXPathParserDOMXPath($this->doc);
    $this->xpath->setConfig(array(
      'debug' => array_keys(array_filter($source_config['exp']['debug'])),
      'errors' => $source_config['exp']['errors'],
    ));

    $all_nodes = $this->xpath->namespacedQuery($source_config['context'], NULL, 'context');

    // The context query may yield something that is not a node list (for
    // example on a failed query); only iterate over iterable results.
    if (is_array($all_nodes) || $all_nodes instanceof Traversable) {
      foreach ($all_nodes as $node) {
        $parsed_item = array();
        $variables = array();

        // A distinct name is used for the key so that the FeedsSource
        // parameter $source is not overwritten.
        foreach ($source_config['sources'] as $source_key => $query) {
          // Variable substitution.
          $query = strtr($query, $variables);

          // Parse the item.
          $result = $this->parseSourceElement($query, $node, $source_key);
          if (!isset($result)) {
            continue;
          }

          // Only sources that still have a mapping become variables. Without
          // this check a stale source key would register the bare string '$'
          // and corrupt every later query containing a dollar sign.
          if (isset($mappings[$source_key])) {
            // Multi-valued results cannot be substituted into a query.
            $variables['$' . $mappings[$source_key]] = is_array($result) ? '' : $result;
          }
          $parsed_item[$source_key] = $result;
        }

        if (!empty($parsed_item)) {
          $parser_result->items[] = $parsed_item;
        }
      }
    }

    // Release the document and XPath helper, keeping the declared properties
    // defined.
    $this->doc = NULL;
    $this->xpath = NULL;

    return $parser_result;
  }

  /**
   * Runs one source query against a context node.
   *
   * @param string $query
   *   An XPath query. An empty query is skipped.
   * @param mixed $context
   *   The context node the query is evaluated against; parse() passes each
   *   node returned by the context query.
   * @param string $source
   *   The name of the source for this query.
   *
   * @return mixed
   *   NULL when the query is empty or matches nothing, the single value when
   *   exactly one node matches, an array of values when several nodes match,
   *   or whatever namespacedQuery() returned when that is not a DOMNodeList.
   */
  protected function parseSourceElement($query, $context, $source) {
    if (empty($query)) {
      return;
    }

    $node_list = $this->xpath->namespacedQuery($query, $context, $source);

    // A value was returned directly from namespacedQuery().
    if (!($node_list instanceof DOMNodeList)) {
      return $node_list;
    }

    // If this source is configured to return raw XML, make it so.
    $use_raw = in_array($source, $this->rawXML, TRUE);

    $results = array();
    foreach ($node_list as $node) {
      $results[] = $use_raw ? $this->getRaw($node) : $node->nodeValue;
    }

    // Empty result returns NULL, that way we can check.
    if (empty($results)) {
      return;
    }

    // Return single result if so.
    return count($results) === 1 ? $results[0] : $results;
  }

  /**
   * Source form.
   *
   * Builds the "XPath Parser Settings" fieldset: the context query, one query
   * field per "xpathparser:" mapping, the raw XML selection and the debug
   * options.
   *
   * @param array $source_config
   *   The source configuration; the importer configuration is used when it is
   *   empty.
   *
   * @return array|null
   *   The form array, or NULL when overriding is disabled and the form is not
   *   being built for configForm().
   */
  public function sourceForm($source_config) {
    $form = array();
    $importer = feeds_importer($this->id);
    $importer_config = $importer->getConfig();
    $mappings_ = isset($importer_config['processor']['config']['mappings']) ? $importer_config['processor']['config']['mappings'] : array();

    if (empty($source_config)) {
      $source_config = $this->getConfig();
    }

    // configForm() sets 'config' so that the importer form is always built.
    if (isset($source_config['allow_override']) &&
        !$source_config['allow_override'] &&
        empty($source_config['config'])) {
      return;
    }

    // Add extensions that might get imported.
    $allowed_extensions = isset($importer_config['fetcher']['config']['allowed_extensions']) ? $importer_config['fetcher']['config']['allowed_extensions'] : FALSE;
    if ($allowed_extensions) {
      if (strpos($allowed_extensions, 'html') === FALSE) {
        $importer->fetcher->config['allowed_extensions'] .= ' html htm';
      }
    }

    $uniques = $mappings = array();
    foreach ($mappings_ as $mapping) {
      if (strpos($mapping['source'], 'xpathparser:') === 0) {
        $mappings[$mapping['source']] = $mapping['target'];
        if (!empty($mapping['unique'])) {
          $uniques[] = $mapping['target'];
        }
      }
    }

    $form['xpath'] = array(
      '#type' => 'fieldset',
      '#tree' => TRUE,
      '#title' => t('XPath Parser Settings'),
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    );

    if (empty($mappings)) {
      // Detect whether the Feeds menu structure has changed. Feeds UI may be
      // disabled, in which case feeds_ui_menu() does not exist and the plain
      // base path is used.
      $feeds_base = 'admin/structure/feeds/';
      if (function_exists('feeds_ui_menu')) {
        $feeds_menu = feeds_ui_menu();
        if (isset($feeds_menu['admin/structure/feeds/list'])) {
          $feeds_base = 'admin/structure/feeds/edit/';
        }
      }
      $form['xpath']['error_message']['#markup'] = '<div class="help">' . t('No XPath mappings are defined. Define mappings !link.', array('!link' => l(t('here'), $feeds_base . $this->id . '/mapping'))) . '</div><br />';
      return $form;
    }

    $form['xpath']['context'] = array(
      '#type' => 'textfield',
      '#title' => t('Context'),
      '#required' => TRUE,
      '#description' => t('This is the base query, all other queries will run in this context.'),
      '#default_value' => isset($source_config['context']) ? $source_config['context'] : '',
      '#maxlength' => 1024,
      '#size' => 80,
    );

    $form['xpath']['sources'] = array(
      '#type' => 'fieldset',
      '#tree' => TRUE,
    );

    if (!empty($uniques)) {
      // format_plural() translates its arguments itself, so it must receive
      // the untranslated strings plus the replacements.
      $unique_list = check_plain(implode(', ', $uniques));
      $items = array(
        format_plural(count($uniques),
          'Field <strong>!column</strong> is mandatory and considered unique: only one item per !column value will be created.',
          'Fields <strong>!columns</strong> are mandatory and values in these columns are considered unique: only one entry per value in one of these columns will be created.',
          array('!column' => $unique_list, '!columns' => $unique_list)
        ),
      );
      $form['xpath']['sources']['help']['#markup'] = '<div class="help">' . theme('item_list', array('items' => $items)) . '</div>';
    }

    // Targets of the preceding fields; each field can reference them.
    $variables = array();
    foreach ($mappings as $source => $target) {
      $form['xpath']['sources'][$source] = array(
        '#type' => 'textfield',
        '#title' => check_plain($target),
        '#description' => t('The XPath query to run.'),
        '#default_value' => isset($source_config['sources'][$source]) ? $source_config['sources'][$source] : '',
        '#maxlength' => 1024,
        '#size' => 80,
      );
      if (!empty($variables)) {
        // The variable names go in as a placeholder so the source string
        // stays constant (translatable) and the names are escaped.
        $variable_text = format_plural(count($variables),
          'The variable @variables is available for replacement.',
          'The variables @variables are available for replacement.',
          array('@variables' => implode(', ', $variables))
        );
        $form['xpath']['sources'][$source]['#description'] .= '<br />' . $variable_text;
      }
      $variables[] = '$' . $target;
    }

    $form['xpath']['rawXML'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Select the queries you would like to return raw XML or HTML'),
      '#options' => $mappings,
      '#default_value' => isset($source_config['rawXML']) ? $source_config['rawXML'] : array(),
    );

    $form['xpath']['exp'] = array(
      '#type' => 'fieldset',
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
      '#tree' => TRUE,
      '#title' => t('Debug Options'),
    );

    $form['xpath']['exp']['errors'] = array(
      '#type' => 'checkbox',
      '#title' => t('Show error messages.'),
      '#default_value' => isset($source_config['exp']['errors']) ? $source_config['exp']['errors'] : FALSE,
    );

    if (extension_loaded('tidy')) {
      $form['xpath']['exp']['tidy'] = array(
        '#type' => 'checkbox',
        '#title' => t('Use Tidy'),
        '#description' => t('The Tidy PHP extension has been detected. Select this to clean the markup before parsing.'),
        '#default_value' => isset($source_config['exp']['tidy']) ? $source_config['exp']['tidy'] : FALSE,
      );
      $form['xpath']['exp']['tidy_encoding'] = array(
        '#type' => 'textfield',
        '#title' => t('Tidy encoding'),
        '#description' => t('Set the encoding for tidy. See the !phpdocs for possible values.', array('!phpdocs' => l(t('PHP docs'), 'http://www.php.net/manual/en/tidy.parsestring.php/'))),
        '#default_value' => isset($source_config['exp']['tidy_encoding']) ? $source_config['exp']['tidy_encoding'] : 'UTF8',
        // Only shown while the "Use Tidy" checkbox is ticked.
        '#states' => array(
          'visible' => array(
            ':input[name$="[tidy]"]' => array(
              'checked' => TRUE,
            ),
          ),
        ),
      );
    }

    $form['xpath']['exp']['debug'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Debug query'),
      '#options' => array_merge(array('context' => 'context'), $mappings),
      '#default_value' => isset($source_config['exp']['debug']) ? $source_config['exp']['debug'] : array(),
    );

    return $form;
  }

  /**
   * Override parent::configForm().
   *
   * Reuses sourceForm() for the importer-level settings, makes the context
   * optional and adds the "allow override" switch.
   */
  public function configForm(&$form_state) {
    $config = $this->getConfig();
    // Forces sourceForm() to build the form even when overriding is disabled.
    $config['config'] = TRUE;
    $form = $this->sourceForm($config);

    // The context element is absent when no XPath mappings are defined; do
    // not create an empty stub element in that case.
    if (isset($form['xpath']['context'])) {
      $form['xpath']['context']['#required'] = FALSE;
    }
    $form['xpath']['#collapsed'] = FALSE;
    $form['xpath']['allow_override'] = array(
      '#type' => 'checkbox',
      '#title' => t('Allow source configuration override'),
      '#description' => t('This setting allows feed nodes to specify their own XPath values for the context and sources.'),
      '#default_value' => $config['allow_override'],
    );

    return $form;
  }

  /**
   * Define defaults.
   *
   * @return array
   *   An empty array: this class declares no source-level defaults.
   */
  public function sourceDefaults() {
    return array();
  }

  /**
   * Define defaults.
   *
   * @return array
   *   The default importer-level configuration of the parser.
   */
  public function configDefaults() {
    return array(
      'sources' => array(),
      'rawXML' => array(),
      'context' => '',
      'exp' => array(
        'errors' => FALSE,
        'tidy' => FALSE,
        'debug' => array(),
        'tidy_encoding' => 'UTF8',
      ),
      'allow_override' => TRUE,
    );
  }

  /**
   * Override parent::sourceFormValidate().
   *
   * If the values of this source are the same as the base config we set them
   * to blank so that the values will be inherited from the importer defaults.
   * The same happens when overriding is disabled or no XPath values were
   * submitted.
   *
   * @param &$values
   *   The values from the form to validate, passed by reference.
   */
  public function sourceFormValidate(&$values) {
    $config = $this->getConfig();
    $allow_override = !empty($config['allow_override']);

    // Nothing to validate: either the source may not override the importer,
    // or the XPath fieldset was not part of the submitted form.
    if (!$allow_override || !isset($values['xpath']) || !is_array($values['xpath'])) {
      $values = array();
      return;
    }

    $values = $values['xpath'];
    unset($config['allow_override']);

    // Sort both by key so that the strict comparison ignores top-level order.
    ksort($values);
    ksort($config);
    if ($values === $config) {
      $values = array();
      return;
    }

    $this->configFormValidate($values);
  }

  /**
   * Override parent::configFormValidate().
   *
   * Evaluates the context query and every source query against an empty
   * sample document and reports the ones libxml rejects. Also trims the
   * queries in $values.
   *
   * @param &$values
   *   The values from the form to validate, passed by reference.
   */
  public function configFormValidate(&$values) {
    $mappings = $this->getOwnMappings();

    // This tests if we're validating configForm or sourceForm.
    $config_form = FALSE;
    if (isset($values['xpath'])) {
      $values = $values['xpath'];
      $config_form = TRUE;
    }

    // Form element names differ between the importer and the source form.
    $element_prefix = $config_form ? 'xpath][' : 'feeds][' . get_class($this) . '][xpath][';

    $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?>' . "\n<items></items>");
    $use_errors = libxml_use_internal_errors(TRUE);
    // Start from a clean slate so that errors raised before this call are not
    // attributed to the queries below.
    libxml_clear_errors();

    $values['context'] = isset($values['context']) ? trim($values['context']) : '';
    if (!empty($values['context'])) {
      $xml->xpath($values['context']);
      $error = libxml_get_last_error();
      // Error code 1219 is undefined namespace prefix.
      // Our sample doc doesn't have any namespaces let alone the one they're
      // trying to use. Besides, if someone is trying to use a namespace in an
      // XPath query, they're probably right.
      if ($error && $error->code != 1219) {
        form_set_error($element_prefix . 'context', t('There was an error with the XPath selector: %error', array('%error' => $error->message)));
      }
    }

    if (!empty($values['sources']) && is_array($values['sources'])) {
      foreach (array_keys($values['sources']) as $key) {
        $query = trim($values['sources'][$key]);
        $values['sources'][$key] = $query;
        if (empty($query)) {
          continue;
        }

        // Drop whatever the previous query left behind; otherwise a tolerated
        // error would be read again for this query.
        libxml_clear_errors();
        $xml->xpath($query);
        $error = libxml_get_last_error();
        if (!$error || $error->code == 1219) {
          continue;
        }

        // Our variable substitution options can cause syntax errors (code
        // 1207), check if we're doing that.
        $variable_present = FALSE;
        if ($error->code == 1207) {
          foreach ($mappings as $target) {
            if (strpos($query, '$' . $target) !== FALSE) {
              $variable_present = TRUE;
              break;
            }
          }
        }

        if (!$variable_present) {
          form_set_error($element_prefix . 'sources][' . $key, t('There was an error with the XPath selector: %error', array('%error' => $error->message)));
        }
      }
    }

    libxml_clear_errors();
    libxml_use_internal_errors($use_errors);
  }

  /**
   * Override parent::getMappingSources().
   *
   * Offers a single new "xpathparser:N" source, where N is one higher than
   * the highest index already in use, so that a new mapping never collides
   * with an existing one regardless of the order of the mappings.
   */
  public function getMappingSources() {
    $mappings = $this->filterMappings(feeds_importer($this->id)->processor->config['mappings']);

    $next = 0;
    foreach (array_keys($mappings) as $source) {
      $parts = explode(':', $source);
      if (isset($parts[1]) && is_numeric($parts[1])) {
        $next = max($next, (int) $parts[1] + 1);
      }
    }

    return array(
      'xpathparser:' . $next => array(
        'name' => t('XPath Expression'),
        'description' => t('Allows you to configure an XPath expression that will populate this field.'),
      ),
    ) + parent::getMappingSources();
  }

  /**
   * Get the mappings that belong to us i.e. mappings that begin with
   * "xpathparser:".
   *
   * @return array
   *   An array of mappings keyed source => target.
   */
  protected function getOwnMappings() {
    $importer_config = feeds_importer($this->id)->getConfig();
    return $this->filterMappings($importer_config['processor']['config']['mappings']);
  }

  /**
   * Filters mappings, returning the ones that belong to us.
   *
   * @param array $mappings
   *   A mapping array from a processor. Anything that is not an array is
   *   treated as "no mappings".
   *
   * @return array
   *   An array of mappings keyed source => target.
   */
  protected function filterMappings($mappings) {
    $our_mappings = array();
    if (!is_array($mappings)) {
      return $our_mappings;
    }
    foreach ($mappings as $mapping) {
      if (strpos($mapping['source'], 'xpathparser:') === 0) {
        $our_mappings[$mapping['source']] = $mapping['target'];
      }
    }
    return $our_mappings;
  }

  /**
   * Start custom error handling.
   *
   * @return bool
   *   The previous value of use_errors.
   */
  protected function errorStart() {
    return libxml_use_internal_errors(TRUE);
  }

  /**
   * Stop custom error handling.
   *
   * Optionally shows the collected libxml errors as Drupal messages, then
   * clears them and restores the previous error handling mode.
   *
   * @param bool $use
   *   The previous value of use_errors.
   * @param bool $print
   *   (Optional) Whether to print errors to the screen. Defaults to TRUE.
   */
  protected function errorStop($use, $print = TRUE) {
    if ($print) {
      foreach (libxml_get_errors() as $error) {
        // Fatal errors are shown as errors, every other level as a warning,
        // so $type is always defined for the current error.
        $type = ($error->level == LIBXML_ERR_FATAL) ? 'error' : 'warning';
        $message = t('%error on line %num. Error code: %code', array('%error' => trim($error->message), '%num' => $error->line, '%code' => $error->code));
        drupal_set_message($message, $type, FALSE);
      }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($use);
  }

  /**
   * Produces the value stored for a node of a "raw XML or HTML" source.
   *
   * Called by parseSourceElement() for every matched node of a source that is
   * selected in the 'rawXML' configuration.
   *
   * @param DOMNode $node
   *   The matched node.
   *
   * @return mixed
   *   The raw representation of the node; presumably a markup string, the
   *   exact format is defined by the implementing class.
   */
  abstract protected function getRaw(DOMNode $node);

}
|