FeedsXPathParserBase.inc 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. <?php
  2. /**
  3. * @file
  4. * Provides the base class for FeedsXPathParserHTML and FeedsXPathParserXML.
  5. */
  6. /**
  7. * Base class for the HTML and XML parsers.
  8. */
  9. abstract class FeedsXPathParserBase extends FeedsParser {
  /**
   * Not referenced by any method visible in this class.
   *
   * NOTE(review): presumably used by subclasses or kept for backwards
   * compatibility -- confirm before removing.
   *
   * @var array
   */
  protected $modified_queries = array();

  /**
   * Keys of the sources whose "raw XML" checkbox is enabled.
   *
   * Populated at the start of parse() and consulted by parseSourceElement().
   *
   * @var array
   */
  protected $rawXML = array();

  /**
   * The document returned by setup() for the current parse() run.
   *
   * @var DOMDocument|null
   */
  protected $doc = NULL;

  /**
   * The FeedsXPathParserDOMXPath instance built around $doc in parse().
   *
   * @var FeedsXPathParserDOMXPath|null
   */
  protected $xpath = NULL;
  /**
   * Builds the document that XPath queries will be run against.
   *
   * Classes that use FeedsXPathParserBase must implement this. It is called
   * once per parse() run, before any query is executed, and its return value
   * is handed to FeedsXPathParserDOMXPath.
   *
   * @param array $source_config
   *   The configuration for the source (or the importer defaults when the
   *   source has no configuration of its own).
   * @param FeedsFetcherResult $fetcher_result
   *   A FeedsFetcherResult object.
   *
   * @return DOMDocument
   *   The DOMDocument to perform XPath queries on.
   */
  abstract protected function setup($source_config, FeedsFetcherResult $fetcher_result);
  /**
   * Implements FeedsParser::parse().
   *
   * Runs the configured context query against the fetched document and, for
   * every node it yields, evaluates each configured source query relative to
   * that node. Results of earlier queries in the same item are available to
   * later queries as "$target" substitution variables.
   *
   * @param FeedsSource $source
   *   The feed source being imported.
   * @param FeedsFetcherResult $fetcher_result
   *   The fetched data to parse.
   *
   * @return FeedsParserResult
   *   The parser result; one entry in ->items per context node that produced
   *   at least one non-NULL value.
   */
  public function parse(FeedsSource $source, FeedsFetcherResult $fetcher_result) {
    $source_config = $source->getConfigFor($this);
    if (empty($source_config)) {
      $source_config = $this->getConfig();
    }

    $this->doc = $this->setup($source_config, $fetcher_result);

    $parser_result = new FeedsParserResult();
    $mappings = $this->getOwnMappings();
    $this->rawXML = array_keys(array_filter($source_config['rawXML']));

    // Set link.
    $fetcher_config = $source->getConfigFor($source->importer->fetcher);
    $parser_result->link = $fetcher_config['source'];

    $this->xpath = new FeedsXPathParserDOMXPath($this->doc);
    $config = array();
    $config['debug'] = array_keys(array_filter($source_config['exp']['debug']));
    $config['errors'] = $source_config['exp']['errors'];
    $this->xpath->setConfig($config);

    $all_nodes = $this->xpath->namespacedQuery($source_config['context'], NULL, 'context');
    // parseSourceElement() shows that namespacedQuery() may return something
    // other than a node list (a scalar, or a failure value). Iterating that
    // would raise a warning, so treat it as "no items".
    if (!is_array($all_nodes) && !($all_nodes instanceof Traversable)) {
      $all_nodes = array();
    }

    foreach ($all_nodes as $node) {
      $parsed_item = $variables = array();

      // The loop key is deliberately NOT called $source: that name is the
      // FeedsSource argument and must not be overwritten.
      foreach ($source_config['sources'] as $source_key => $query) {
        // Variable substitution.
        $query = strtr($query, $variables);

        // Parse the item.
        $result = $this->parseSourceElement($query, $node, $source_key);
        if (!isset($result)) {
          continue;
        }

        // Only expose a substitution variable for sources that still have a
        // mapping; a stale key would otherwise register the bare token '$'.
        if (isset($mappings[$source_key])) {
          // Multi-valued results cannot be substituted into a query string.
          $variables['$' . $mappings[$source_key]] = is_array($result) ? '' : $result;
        }
        $parsed_item[$source_key] = $result;
      }

      if (!empty($parsed_item)) {
        $parser_result->items[] = $parsed_item;
      }
    }

    // Release the document and XPath helper, restoring the declared defaults
    // instead of removing the properties from the object.
    $this->doc = NULL;
    $this->xpath = NULL;

    return $parser_result;
  }
  /**
   * Runs a single source query relative to one context node.
   *
   * @param string $query
   *   An XPath query. An empty query is skipped.
   * @param mixed $context
   *   The context node the query is evaluated against; passed straight
   *   through to namespacedQuery().
   * @param string $source
   *   The name of the source for this query.
   *
   * @return mixed
   *   NULL when the query is empty or matched nothing, the single value when
   *   exactly one node matched, an array of values when several matched, or
   *   whatever namespacedQuery() returned when that is not a DOMNodeList.
   */
  protected function parseSourceElement($query, $context, $source) {
    if (empty($query)) {
      return;
    }

    $outcome = $this->xpath->namespacedQuery($query, $context, $source);

    // A value was returned directly from namespacedQuery().
    if (!($outcome instanceof DOMNodeList)) {
      return $outcome;
    }

    // Collect one value per matched node; sources flagged as raw get their
    // markup from getRaw(), all others get the node's text value.
    $wants_raw = in_array($source, $this->rawXML);
    $values = array();
    foreach ($outcome as $matched) {
      $values[] = $wants_raw ? $this->getRaw($matched) : $matched->nodeValue;
    }

    switch (count($values)) {
      case 0:
        // Empty result returns NULL, that way callers can check with isset().
        return NULL;

      case 1:
        // Unwrap a single result.
        return $values[0];

      default:
        return $values;
    }
  }
  /**
   * Builds the XPath settings form shown on feed sources.
   *
   * Also reused by configForm() for the importer-level settings, in which
   * case $source_config['config'] is set.
   *
   * @param array $source_config
   *   The current source configuration. When empty, the importer-level
   *   configuration is used for the default values.
   *
   * @return array|null
   *   The form array, or NULL when overriding is disabled and this is not the
   *   importer-level form.
   */
  public function sourceForm($source_config) {
    $form = array();
    $importer = feeds_importer($this->id);
    $importer_config = $importer->getConfig();
    $mappings_ = $importer_config['processor']['config']['mappings'];

    if (empty($source_config)) {
      $source_config = $this->getConfig();
    }

    if (isset($source_config['allow_override']) &&
        !$source_config['allow_override'] &&
        empty($source_config['config'])) {
      return;
    }

    // Add extensions that might get imported.
    $allowed_extensions = isset($importer_config['fetcher']['config']['allowed_extensions']) ? $importer_config['fetcher']['config']['allowed_extensions'] : FALSE;
    if ($allowed_extensions) {
      if (strpos($allowed_extensions, 'html') === FALSE) {
        $importer->fetcher->config['allowed_extensions'] .= ' html htm';
      }
    }

    // Collect our own mappings (source => target) and the targets flagged as
    // unique. The 'unique' key is optional, so it must not be read blindly.
    $uniques = $mappings = array();
    foreach ($mappings_ as $mapping) {
      if (strpos($mapping['source'], 'xpathparser:') === 0) {
        $mappings[$mapping['source']] = $mapping['target'];
        if (!empty($mapping['unique'])) {
          $uniques[] = $mapping['target'];
        }
      }
    }

    $form['xpath'] = array(
      '#type' => 'fieldset',
      '#tree' => TRUE,
      '#title' => t('XPath Parser Settings'),
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
    );

    if (empty($mappings)) {
      // Detect if Feeds menu structure has changed. feeds_ui_menu() only
      // exists while the Feeds UI module is enabled, so guard the call.
      $feeds_base = 'admin/structure/feeds/';
      if (function_exists('feeds_ui_menu')) {
        $feeds_menu = feeds_ui_menu();
        if (isset($feeds_menu['admin/structure/feeds/list'])) {
          $feeds_base = 'admin/structure/feeds/edit/';
        }
      }
      $form['xpath']['error_message']['#markup'] = '<div class="help">' . t('No XPath mappings are defined. Define mappings !link.', array('!link' => l(t('here'), $feeds_base . $this->id . '/mapping'))) . '</div><br />';
      return $form;
    }

    $form['xpath']['context'] = array(
      '#type' => 'textfield',
      '#title' => t('Context'),
      '#required' => TRUE,
      '#description' => t('This is the base query, all other queries will run in this context.'),
      '#default_value' => isset($source_config['context']) ? $source_config['context'] : '',
      '#maxlength' => 1024,
      '#size' => 80,
    );

    $form['xpath']['sources'] = array(
      '#type' => 'fieldset',
      '#tree' => TRUE,
    );

    if (!empty($uniques)) {
      // format_plural() translates its arguments itself, so it must receive
      // the untranslated literals. '@columns' escapes the target names.
      $items = array(
        format_plural(count($uniques),
          'Field <strong>@columns</strong> is mandatory and considered unique: only one item per @columns value will be created.',
          'Fields <strong>@columns</strong> are mandatory and values in these columns are considered unique: only one entry per value in one of these columns will be created.',
          array('@columns' => implode(', ', $uniques))
        ),
      );
      $form['xpath']['sources']['help']['#markup'] = '<div class="help">' . theme('item_list', array('items' => $items)) . '</div>';
    }

    // Each query may reference the results of the queries defined before it,
    // so the list of available variables grows as we walk the mappings.
    $variables = array();
    foreach ($mappings as $source => $target) {
      $form['xpath']['sources'][$source] = array(
        '#type' => 'textfield',
        '#title' => check_plain($target),
        '#description' => t('The XPath query to run.'),
        '#default_value' => isset($source_config['sources'][$source]) ? $source_config['sources'][$source] : '',
        '#maxlength' => 1024,
        '#size' => 80,
      );
      if (!empty($variables)) {
        // Use a placeholder rather than concatenating the names into the
        // translatable string, so the string stays translatable and escaped.
        $variable_text = format_plural(count($variables),
          'The variable @variables is available for replacement.',
          'The variables @variables are available for replacement.',
          array('@variables' => implode(', ', $variables))
        );
        $form['xpath']['sources'][$source]['#description'] .= '<br />' . $variable_text;
      }
      $variables[] = '$' . $target;
    }

    $form['xpath']['rawXML'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Select the queries you would like to return raw XML or HTML'),
      '#options' => $mappings,
      '#default_value' => isset($source_config['rawXML']) ? $source_config['rawXML'] : array(),
    );

    $form['xpath']['exp'] = array(
      '#type' => 'fieldset',
      '#collapsible' => TRUE,
      '#collapsed' => TRUE,
      '#tree' => TRUE,
      '#title' => t('Debug Options'),
    );
    $form['xpath']['exp']['errors'] = array(
      '#type' => 'checkbox',
      '#title' => t('Show error messages.'),
      '#default_value' => isset($source_config['exp']['errors']) ? $source_config['exp']['errors'] : FALSE,
    );

    if (extension_loaded('tidy')) {
      $form['xpath']['exp']['tidy'] = array(
        '#type' => 'checkbox',
        '#title' => t('Use Tidy'),
        // Kept on one line: a line break inside the literal becomes part of
        // the translation source string.
        '#description' => t('The Tidy PHP extension has been detected. Select this to clean the markup before parsing.'),
        '#default_value' => isset($source_config['exp']['tidy']) ? $source_config['exp']['tidy'] : FALSE,
      );
      $form['xpath']['exp']['tidy_encoding'] = array(
        '#type' => 'textfield',
        '#title' => t('Tidy encoding'),
        '#description' => t('Set the encoding for tidy. See the !phpdocs for possible values.', array('!phpdocs' => l(t('PHP docs'), 'http://www.php.net/manual/en/tidy.parsestring.php/'))),
        '#default_value' => isset($source_config['exp']['tidy_encoding']) ? $source_config['exp']['tidy_encoding'] : 'UTF8',
        // Only show the encoding field while "Use Tidy" is checked.
        '#states' => array(
          'visible' => array(
            ':input[name$="[tidy]"]' => array(
              'checked' => TRUE,
            ),
          ),
        ),
      );
    }

    $form['xpath']['exp']['debug'] = array(
      '#type' => 'checkboxes',
      '#title' => t('Debug query'),
      '#options' => array_merge(array('context' => 'context'), $mappings),
      '#default_value' => isset($source_config['exp']['debug']) ? $source_config['exp']['debug'] : array(),
    );

    return $form;
  }
  /**
   * Overrides parent::configForm().
   *
   * Reuses the source form for the importer-level settings: the context is
   * optional here, the fieldset starts expanded, and an extra checkbox
   * controls whether individual sources may override these values.
   *
   * @param array $form_state
   *   The form state, passed by reference. Not used by this implementation.
   *
   * @return array
   *   The importer-level configuration form.
   */
  public function configForm(&$form_state) {
    $settings = $this->getConfig();
    // Tells sourceForm() that it is building the importer-level form.
    $settings['config'] = TRUE;

    $override_toggle = array(
      '#type' => 'checkbox',
      '#title' => t('Allow source configuration override'),
      '#description' => t('This setting allows feed nodes to specify their own XPath values for the context and sources.'),
      '#default_value' => $settings['allow_override'],
    );

    $form = $this->sourceForm($settings);
    $form['xpath']['context']['#required'] = FALSE;
    $form['xpath']['#collapsed'] = FALSE;
    $form['xpath']['allow_override'] = $override_toggle;

    return $form;
  }
  /**
   * Defines the per-source defaults.
   *
   * @return array
   *   Always an empty array.
   */
  public function sourceDefaults() {
    $defaults = array();
    return $defaults;
  }
  /**
   * Defines the importer-level defaults.
   *
   * @return array
   *   The default configuration: no source queries, no raw-XML sources, an
   *   empty context, debugging options switched off, and source overrides
   *   allowed.
   */
  public function configDefaults() {
    // Key order is kept stable on purpose: sourceFormValidate() compares
    // configuration arrays with ===, which is order-sensitive.
    $debug_options = array();
    $debug_options['errors'] = FALSE;
    $debug_options['tidy'] = FALSE;
    $debug_options['debug'] = array();
    $debug_options['tidy_encoding'] = 'UTF8';

    $defaults = array();
    $defaults['sources'] = array();
    $defaults['rawXML'] = array();
    $defaults['context'] = '';
    $defaults['exp'] = $debug_options;
    $defaults['allow_override'] = TRUE;

    return $defaults;
  }
  /**
   * Overrides parent::sourceFormValidate().
   *
   * If the values of this source are the same as the base config we set them
   * to blank so that the values will be inherited from the importer defaults.
   * The same happens when overriding is disabled, or when the form submitted
   * no XPath values at all (sourceForm() renders no inputs when overriding is
   * disabled or when no XPath mappings exist).
   *
   * @param array &$values
   *   The values from the form to validate, passed by reference. On return
   *   this holds the contents of the 'xpath' sub-array, or an empty array.
   */
  public function sourceFormValidate(&$values) {
    $config = $this->getConfig();
    $allow_override = !empty($config['allow_override']);
    unset($config['allow_override']);

    // The 'xpath' key is absent when the form had no XPath inputs; passing
    // NULL on to ksort() is a TypeError on PHP 8.
    $submitted = (isset($values['xpath']) && is_array($values['xpath'])) ? $values['xpath'] : array();

    if (!$allow_override || empty($submitted)) {
      $values = array();
      return;
    }

    // Normalise key order so that === only reports real differences.
    ksort($submitted);
    ksort($config);
    if ($submitted === $config) {
      $values = array();
      return;
    }

    $values = $submitted;
    $this->configFormValidate($values);
  }
  /**
   * Overrides parent::configFormValidate().
   *
   * Syntax-checks the context query and every source query by running them
   * against an empty sample document, and flags the offending form element
   * when libxml reports an error.
   *
   * Two error codes are tolerated:
   * - 1219 (undefined namespace prefix): the sample document declares no
   *   namespaces, so any prefixed query would trigger it.
   * - 1207, but only when the query contains one of our "$target"
   *   substitution variables, since the unsubstituted variable can cause it.
   *
   * @param array &$values
   *   The submitted values, passed by reference. When they are wrapped in an
   *   'xpath' key (the importer config form) they are unwrapped in place.
   *   The context and all source queries are trimmed in place.
   */
  public function configFormValidate(&$values) {
    $mappings = $this->getOwnMappings();

    // This tests if we're validating configForm or sourceForm.
    $config_form = FALSE;
    if (isset($values['xpath'])) {
      $values = $values['xpath'];
      $config_form = TRUE;
    }

    $class = get_class($this);
    $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?>' . "\n<items></items>");
    $use_errors = libxml_use_internal_errors(TRUE);
    // Start from a clean slate so an error raised before this method ran is
    // not mistaken for a problem with the context query.
    libxml_clear_errors();

    $values['context'] = isset($values['context']) ? trim($values['context']) : '';
    if (!empty($values['context'])) {
      $xml->xpath($values['context']);
      $error = libxml_get_last_error();
      // Clear unconditionally: a tolerated error must not linger and be
      // attributed to the next query.
      libxml_clear_errors();

      // Error code 1219 is undefined namespace prefix.
      // Our sample doc doesn't have any namespaces let alone the one they're
      // trying to use. Besides, if someone is trying to use a namespace in an
      // XPath query, they're probably right.
      if ($error && $error->code != 1219) {
        $element = $config_form ? 'xpath][context' : 'feeds][' . $class . '][xpath][context';
        form_set_error($element, t('There was an error with the XPath selector: %error', array('%error' => $error->message)));
      }
    }

    if (!empty($values['sources']) && is_array($values['sources'])) {
      foreach ($values['sources'] as $key => &$query) {
        $query = trim($query);
        if (empty($query)) {
          continue;
        }

        $xml->xpath($query);
        $error = libxml_get_last_error();
        libxml_clear_errors();

        if (!$error || $error->code == 1219) {
          continue;
        }

        // Our variable substitution options can cause syntax errors, check
        // if we're doing that.
        $variable_present = FALSE;
        if ($error->code == 1207) {
          foreach ($mappings as $target) {
            if (strpos($query, '$' . $target) !== FALSE) {
              $variable_present = TRUE;
              break;
            }
          }
        }

        if (!$variable_present) {
          $element = $config_form ? 'xpath][sources][' . $key : 'feeds][' . $class . '][xpath][sources][' . $key;
          form_set_error($element, t('There was an error with the XPath selector: %error', array('%error' => $error->message)));
        }
      }
      // Break the reference so later writes to $query cannot modify the last
      // element of $values['sources'].
      unset($query);
    }

    libxml_use_internal_errors($use_errors);
  }
  /**
   * Overrides parent::getMappingSources().
   *
   * Offers one new "xpathparser:N" source, where N is one higher than the
   * highest index already used by this importer's mappings.
   *
   * @return array
   *   The new XPath source definition followed by the parent's sources.
   */
  public function getMappingSources() {
    $mappings = $this->filterMappings(feeds_importer($this->id)->processor->config['mappings']);

    // Mappings can be re-ordered, so the last one is not necessarily the one
    // with the highest index. Scan them all, otherwise an index that is
    // already in use could be handed out again.
    $next = 0;
    foreach (array_keys($mappings) as $key) {
      $parts = explode(':', $key);
      $index = isset($parts[1]) ? (int) $parts[1] : 0;
      if ($index >= $next) {
        $next = $index + 1;
      }
    }

    return array(
      'xpathparser:' . $next => array(
        'name' => t('XPath Expression'),
        'description' => t('Allows you to configure an XPath expression that will populate this field.'),
      ),
    ) + parent::getMappingSources();
  }
  /**
   * Returns the importer's mappings whose source begins with "xpathparser:".
   *
   * @return array
   *   An array of mappings keyed source => target.
   */
  protected function getOwnMappings() {
    $importer = feeds_importer($this->id);
    $settings = $importer->getConfig();
    $processor_mappings = $settings['processor']['config']['mappings'];

    return $this->filterMappings($processor_mappings);
  }
  /**
   * Reduces a processor's mapping list to the entries owned by this parser.
   *
   * An entry is ours when its 'source' starts with "xpathparser:".
   *
   * @param array $mappings
   *   A mapping array from a processor; each entry has at least a 'source'
   *   and a 'target' key.
   *
   * @return array
   *   An array of mappings keyed source => target.
   */
  protected function filterMappings($mappings) {
    $prefix = 'xpathparser:';
    $prefix_length = strlen($prefix);

    $owned = array();
    foreach ($mappings as $entry) {
      $source_key = $entry['source'];
      // Skip anything that does not start with our prefix.
      if (strncmp($source_key, $prefix, $prefix_length) !== 0) {
        continue;
      }
      $owned[$source_key] = $entry['target'];
    }

    return $owned;
  }
  /**
   * Switches libxml to internal (buffered) error handling.
   *
   * @return bool
   *   The previous value of use_errors, to be handed back to errorStop().
   */
  protected function errorStart() {
    $previous = libxml_use_internal_errors(TRUE);
    return $previous;
  }
  /**
   * Stop custom error handling.
   *
   * Optionally reports every buffered libxml error as a Drupal message, then
   * clears the buffer and restores the previous error-handling mode.
   *
   * @param bool $use
   *   The previous value of use_errors.
   * @param bool $print
   *   (Optional) Whether to print errors to the screen. Defaults to TRUE.
   */
  protected function errorStop($use, $print = TRUE) {
    if ($print) {
      foreach (libxml_get_errors() as $error) {
        // Fatal errors are shown as errors; every other level is shown as a
        // warning. Deciding per error guarantees $type is always defined and
        // never carried over from the previous iteration.
        $type = ($error->level == LIBXML_ERR_FATAL) ? 'error' : 'warning';

        $message = t('%error on line %num. Error code: %code', array('%error' => trim($error->message), '%num' => $error->line, '%code' => $error->code));
        drupal_set_message($message, $type, FALSE);
      }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($use);
  }
  /**
   * Returns the raw markup for a node.
   *
   * Called by parseSourceElement() for every matched node of a source that
   * is flagged to return raw XML or HTML.
   *
   * @param DOMNode $node
   *   The matched node.
   *
   * @return mixed
   *   The raw representation of the node.
   *   NOTE(review): presumably a string of serialized XML/HTML -- confirm
   *   against the concrete subclasses.
   */
  abstract protected function getRaw(DOMNode $node);
  474. }