xmlsitemap.generate.inc 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. <?php
  2. /**
  3. * @file
  4. * Sitemap generation and rebuilding functions for the xmlsitemap module.
  5. *
  6. * @ingroup xmlsitemap
  7. */
  /**
   * Look up the alias of an internal Drupal path using prefetched alias tables.
   *
   * This is similar to drupal_get_path_alias(), but all aliases of a language
   * are fetched with a single query and kept in a static cache, so sitemap
   * generation does not issue one query per link.
   *
   * @param string $path
   *   An internal Drupal path.
   * @param string $language
   *   A language code to use when looking up the paths.
   *
   * @return string
   *   The alias found for the path (language-specific first, then
   *   language-neutral), or the path itself when no alias exists, after it has
   *   been passed through hook_url_outbound_alter().
   */
  function xmlsitemap_get_path_alias($path, $language) {
    static $aliases;
    static $last_language;

    $sql = "SELECT source, alias FROM {url_alias} WHERE language = :language ORDER BY pid";
    $language_specific = ($language != LANGUAGE_NONE);

    // The language-neutral aliases are loaded once and kept for every lookup.
    if (!isset($aliases)) {
      $aliases[LANGUAGE_NONE] = db_query($sql, array(':language' => LANGUAGE_NONE))->fetchAllKeyed();
    }

    // Only one language-specific alias set is held at a time: when the
    // requested language changes, the previous set is dropped and replaced.
    if ($language_specific && $last_language != $language) {
      unset($aliases[$last_language]);
      $aliases[$language] = db_query($sql, array(':language' => $language))->fetchAllKeyed();
      $last_language = $language;
    }

    // hook_url_outbound_alter() expects defaults in url() options.
    $options = array(
      'fragment' => '',
      'query' => array(),
      'absolute' => FALSE,
      'alias' => FALSE,
      'prefix' => '',
      'external' => FALSE,
    );

    // Prefer an alias in the requested language, then fall back to the
    // language-neutral one. Without a match the path is used unchanged.
    $normalized_path = $path;
    $lookup_order = $language_specific ? array($language, LANGUAGE_NONE) : array(LANGUAGE_NONE);
    foreach ($lookup_order as $langcode) {
      if (isset($aliases[$langcode][$path])) {
        $normalized_path = $aliases[$langcode][$path];
        $options['alias'] = TRUE;
        break;
      }
    }

    // We need to pass our path through hook_url_outbound_alter(). This fixes
    // clean URLs not working when they don't exist in the {url_alias} table and
    // are created with something like subpathauto.
    $original_path = $normalized_path;
    drupal_alter('url_outbound', $normalized_path, $options, $original_path);

    return $normalized_path;
  }
  /**
   * Perform operations before rebuilding the sitemap.
   *
   * Tries to raise the PHP memory limit and, when the
   * 'xmlsitemap_developer_mode' variable is enabled, logs the current peak
   * memory usage as a debug watchdog entry.
   */
  function _xmlsitemap_regenerate_before() {
    // Attempt to increase the memory limit.
    _xmlsitemap_set_memory_limit();

    // The start-of-generation log entry is only written in developer mode.
    if (!variable_get('xmlsitemap_developer_mode', 0)) {
      return;
    }

    $replacements = array(
      '@memory-peak' => format_size(memory_get_peak_usage(TRUE)),
    );
    watchdog('xmlsitemap', 'Starting XML sitemap generation. Memory usage: @memory-peak.', $replacements, WATCHDOG_DEBUG);
  }
  /**
   * Get the growth in peak memory usage since a recorded starting point.
   *
   * @param bool $start
   *   TRUE to (re)set the starting point to the current peak usage. The
   *   starting point is also set automatically on the first call.
   *
   * @return int
   *   The current peak memory usage minus the recorded starting value, in
   *   bytes. This is 0 on the call that records the starting point.
   */
  function _xmlsitemap_get_memory_usage($start = FALSE) {
    static $baseline = NULL;

    $peak = memory_get_peak_usage(TRUE);
    if ($start || $baseline === NULL) {
      $baseline = $peak;
    }

    return $peak - $baseline;
  }
  /**
   * Calculate the optimal PHP memory limit for sitemap generation.
   *
   * This function just makes a guess. It does not take into account
   * the currently loaded modules. The result is cached with drupal_static().
   *
   * @return int|float
   *   The estimated limit in bytes: the core minimum memory limit, plus 500
   *   bytes per link in a chunk, plus 250 bytes per URL alias when alias
   *   prefetching is enabled.
   */
  function _xmlsitemap_get_optimal_memory_limit() {
    $optimal_limit = &drupal_static(__FUNCTION__);

    // Reuse the estimate calculated earlier in this request.
    if (isset($optimal_limit)) {
      return $optimal_limit;
    }

    // Start from the base memory amount given by the core constant.
    $optimal_limit = parse_size(DRUPAL_MINIMUM_PHP_MEMORY_LIMIT);

    // Add memory based on the chunk size.
    $optimal_limit += xmlsitemap_get_chunk_size() * 500;

    // Add memory for storing the url aliases.
    if (variable_get('xmlsitemap_prefetch_aliases', 1)) {
      $alias_count = db_query("SELECT COUNT(pid) FROM {url_alias}")->fetchField();
      $optimal_limit += $alias_count * 250;
    }

    return $optimal_limit;
  }
  /**
   * Raise the PHP memory limit for sitemap generation when it is too low.
   *
   * Nothing is changed when the current limit cannot be read, is unlimited
   * (-1), or is already at least as large as the requested limit.
   *
   * @param int|null $new_limit
   *   An optional PHP memory limit in bytes. If not provided, the value of
   *   _xmlsitemap_get_optimal_memory_limit() will be used.
   *
   * @return string|false|null
   *   The return value of ini_set() when a new limit was applied (FALSE if
   *   ini_set() failed), or NULL when the limit was left untouched.
   */
  function _xmlsitemap_set_memory_limit($new_limit = NULL) {
    // Errors are suppressed on purpose: raising the limit is best-effort.
    $current_limit = @ini_get('memory_limit');
    if ($current_limit && $current_limit != -1) {
      // Fall back to the calculated optimum only when the caller did not
      // request a specific limit. (The check used to be inverted, which left
      // the default NULL in place and discarded explicitly passed limits.)
      if (is_null($new_limit)) {
        $new_limit = _xmlsitemap_get_optimal_memory_limit();
      }
      // Only ever raise the limit, never lower it.
      if (parse_size($current_limit) < $new_limit) {
        return @ini_set('memory_limit', $new_limit);
      }
    }
  }
  /**
   * Generate one page (chunk) of the sitemap.
   *
   * Any exception raised while writing is logged with watchdog_exception() and
   * then re-thrown to the caller.
   *
   * @param object $sitemap
   *   The XML sitemap object to generate a page for.
   * @param int $page
   *   The number of the specific sitemap page to generate.
   *
   * @return mixed
   *   The element count reported by the writer's getSitemapElementCount().
   */
  function xmlsitemap_generate_page(stdClass $sitemap, $page) {
    try {
      $writer = new XMLSitemapWriter($sitemap, $page);
      $writer->startDocument();
      $writer->generateXML();
      $writer->endDocument();
    }
    catch (Exception $exception) {
      // Record the failure before handing it back to the caller.
      watchdog_exception('xmlsitemap', $exception);
      throw $exception;
    }

    $element_count = $writer->getSitemapElementCount();
    return $element_count;
  }
  /**
   * Write the links of one sitemap chunk to the given writer.
   *
   * Selects the accessible, published rows of the {xmlsitemap} table that fall
   * into the requested chunk, builds the URL and optional lastmod, changefreq
   * and priority values of each link, and writes them as 'url' elements.
   *
   * @param object $sitemap
   *   The XML sitemap object; its uri options seed the url() options.
   * @param XMLSitemapWriter $writer
   *   The writer that receives each element via writeSitemapElement().
   * @param int $chunk
   *   The chunk (page) number; values below 1 are treated as the first chunk.
   *
   * @return int
   *   The number of links written, not counting skipped duplicates.
   */
  function xmlsitemap_generate_chunk(stdClass $sitemap, XMLSitemapWriter $writer, $chunk) {
    global $base_url;

    $default_elements = array(
      'lastmod',
      'changefreq',
      'priority',
    );
    $output_elements = drupal_map_assoc(variable_get('xmlsitemap_output_elements', $default_elements));
    $lastmod_format = variable_get('xmlsitemap_lastmod_format', XMLSITEMAP_LASTMOD_MEDIUM);

    // Options set on the sitemap take precedence over these defaults.
    $url_options = $sitemap->uri['options'] + array(
      'absolute' => TRUE,
      'base_url' => variable_get('xmlsitemap_base_url', $base_url),
      'language' => language_default(),
      'alias' => variable_get('xmlsitemap_prefetch_aliases', TRUE),
    );

    $columns = array(
      'id',
      'type',
      'subtype',
      'loc',
      'lastmod',
      'changefreq',
      'changecount',
      'priority',
      'language',
      'access',
      'status',
    );
    $query = db_select('xmlsitemap', 'x');
    $query->fields('x', $columns);
    $query->condition('x.access', 1)
      ->condition('x.status', 1)
      ->orderBy('x.language', 'DESC')
      ->orderBy('x.loc');
    $query->addTag('xmlsitemap_generate');
    $query->addMetaData('sitemap', $sitemap);

    // Restrict the query to the rows that belong to the requested chunk.
    $offset = max($chunk - 1, 0) * xmlsitemap_get_chunk_size();
    $query->range($offset, xmlsitemap_get_chunk_size());
    $links = $query->execute();

    $previous_url = '';
    $written = 0;

    while ($link = $links->fetchAssoc()) {
      // Replace the language code with a language object; language-neutral
      // links use the default from the URL options.
      if ($link['language'] != LANGUAGE_NONE) {
        $link['language'] = xmlsitemap_language_load($link['language']);
      }
      else {
        $link['language'] = $url_options['language'];
      }

      // Keep only the path in 'loc'; query and fragment are passed to url()
      // separately below.
      $parsed_url = drupal_parse_url($link['loc']);
      $link['loc'] = $parsed_url['path'];
      if ($url_options['alias']) {
        $link['loc'] = xmlsitemap_get_path_alias($link['loc'], $link['language']->language);
      }

      $link_options = array(
        'language' => $link['language'],
        'xmlsitemap_link' => $link,
        'xmlsitemap_sitemap' => $sitemap,
        'query' => $parsed_url['query'],
        'fragment' => $parsed_url['fragment'],
      );
      // @todo Add a separate hook_xmlsitemap_link_url_alter() here?
      $link_url = url($link['loc'], $link_options + $url_options);

      // Skip this link if it was a duplicate of the last one.
      // @todo Figure out a way to do this before generation so we can report
      // back to the user about this.
      if ($link_url == $previous_url) {
        continue;
      }
      $previous_url = $link_url;
      // Keep track of the total number of links written.
      $written++;

      $element = array('loc' => $link_url);

      if ($link['lastmod']) {
        if (!empty($output_elements['lastmod'])) {
          $element['lastmod'] = gmdate($lastmod_format, $link['lastmod']);
        }
        // If the link has a lastmod value, update the changefreq so that links
        // with a short changefreq but updated two years ago show decay.
        // We use abs() here just in case items were created on this same cron
        // run because lastmod would be greater than REQUEST_TIME.
        $age = abs(REQUEST_TIME - $link['lastmod']);
        $link['changefreq'] = ($age + $link['changefreq']) / 2;
      }

      if (!empty($output_elements['changefreq']) && $link['changefreq']) {
        $element['changefreq'] = xmlsitemap_get_changefreq($link['changefreq']);
      }

      // Don't output the priority value for links that have 0.5 priority. This
      // is the default 'assumed' value if priority is not included as per the
      // sitemaps.org specification.
      if (!empty($output_elements['priority']) && isset($link['priority']) && $link['priority'] != 0.5) {
        $element['priority'] = number_format($link['priority'], 1);
      }

      // @todo Should this be moved to XMLSitemapWriter::writeSitemapElement()?
      drupal_alter('xmlsitemap_element', $element, $link, $sitemap);
      $writer->writeSitemapElement('url', $element);
    }

    return $written;
  }
  /**
   * Generate the index sitemap.
   *
   * Any exception raised while writing is logged with watchdog_exception() and
   * then re-thrown to the caller.
   *
   * @param object $sitemap
   *   The XML sitemap object to generate the index for.
   *
   * @return mixed
   *   The element count reported by the writer's getSitemapElementCount().
   */
  function xmlsitemap_generate_index(stdClass $sitemap) {
    try {
      $writer = new XMLSitemapIndexWriter($sitemap);
      $writer->startDocument();
      $writer->generateXML();
      $writer->endDocument();
    }
    catch (Exception $exception) {
      // Record the failure before handing it back to the caller.
      watchdog_exception('xmlsitemap', $exception);
      throw $exception;
    }

    $element_count = $writer->getSitemapElementCount();
    return $element_count;
  }
  // BATCH OPERATIONS -----------------------------------------------------------.

  /**
   * Batch information callback for regenerating the sitemap files.
   *
   * @param array $smids
   *   An optional array of XML sitemap IDs. If not provided, it will load all
   *   existing XML sitemaps.
   *
   * @return array
   *   A batch definition with 'operations', 'finished', 'title' and 'file'
   *   keys.
   */
  function xmlsitemap_regenerate_batch(array $smids = array()) {
    if (empty($smids)) {
      $smids = db_query("SELECT smid FROM {xmlsitemap_sitemap}")->fetchCol();
    }

    $operations = array();

    // Set the regenerate flag in case something fails during file generation.
    $operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_regenerate_needed' => TRUE)));

    // @todo Get rid of this batch operation.
    $operations[] = array('_xmlsitemap_regenerate_before', array());

    // Generate all the sitemap pages for each context, followed by its index.
    foreach ($smids as $smid) {
      $operations[] = array('xmlsitemap_regenerate_batch_generate', array($smid));
      $operations[] = array('xmlsitemap_regenerate_batch_generate_index', array($smid));
    }

    // Clear the regeneration flag.
    $operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_regenerate_needed' => FALSE)));

    return array(
      'operations' => $operations,
      'finished' => 'xmlsitemap_regenerate_batch_finished',
      'title' => t('Regenerating Sitemap'),
      'file' => drupal_get_path('module', 'xmlsitemap') . '/xmlsitemap.generate.inc',
    );
  }
  /**
   * Batch callback; generate all pages of a sitemap.
   *
   * Each invocation generates one page. When a page comes back without links,
   * the surplus file is removed, the chunk and link totals are saved on the
   * sitemap, and the operation ends.
   *
   * @param mixed $smid
   *   The ID of the XML sitemap, as accepted by xmlsitemap_sitemap_load().
   * @param array $context
   *   The batch context; 'sandbox', 'message' and 'finished' are updated.
   */
  function xmlsitemap_regenerate_batch_generate($smid, array &$context) {
    $sandbox = &$context['sandbox'];

    // First invocation: load the sitemap and reset its counters.
    if (!isset($sandbox['sitemap'])) {
      $sandbox['sitemap'] = xmlsitemap_sitemap_load($smid);
      $sandbox['sitemap']->chunks = 1;
      $sandbox['sitemap']->links = 0;
      $sandbox['max'] = XMLSITEMAP_MAX_SITEMAP_LINKS;

      // Clear the cache directory for this sitemap before generating any files.
      xmlsitemap_check_directory($sandbox['sitemap']);
      xmlsitemap_clear_directory($sandbox['sitemap']);
    }

    $sitemap = &$sandbox['sitemap'];
    $page_links = xmlsitemap_generate_page($sitemap, $sitemap->chunks);

    $page_url_options = $sitemap->uri['options'] + array('query' => array('page' => $sitemap->chunks));
    $context['message'] = t('Now generating %sitemap-url.', array('%sitemap-url' => url('sitemap.xml', $page_url_options)));

    if ($page_links) {
      // The page had content; count it and move on to the next one.
      $sitemap->links += $page_links;
      $sitemap->chunks++;
    }
    else {
      // Cleanup the 'extra' empty file.
      $empty_file = xmlsitemap_sitemap_get_file($sitemap, $sitemap->chunks);
      if (file_exists($empty_file) && $sitemap->chunks > 1) {
        file_unmanaged_delete($empty_file);
      }
      $sitemap->chunks--;

      // Save the updated chunks and links values.
      $sandbox['max'] = $sitemap->chunks;
      $sitemap->updated = REQUEST_TIME;
      xmlsitemap_sitemap_get_max_filesize($sitemap);
      xmlsitemap_sitemap_save($sitemap);
    }

    // Report partial progress for as long as the final page count is not hit.
    if ($sitemap->chunks != $sandbox['max']) {
      $context['finished'] = $sitemap->chunks / $sandbox['max'];
    }
  }
  /**
   * Batch callback; generate the index page of a sitemap.
   *
   * The index is only generated for sitemaps that have more than one chunk.
   *
   * @param mixed $smid
   *   The ID of the XML sitemap, as accepted by xmlsitemap_sitemap_load().
   * @param array $context
   *   The batch context; 'message' is set when an index is generated.
   */
  function xmlsitemap_regenerate_batch_generate_index($smid, array &$context) {
    $sitemap = xmlsitemap_sitemap_load($smid);

    // No index is generated unless the sitemap spans several chunks.
    if (!($sitemap->chunks > 1)) {
      return;
    }

    xmlsitemap_generate_index($sitemap);
    $index_url = url('sitemap.xml', $sitemap->uri['options']);
    $context['message'] = t('Now generating sitemap index %sitemap-url.', array('%sitemap-url' => $index_url));
  }
  /**
   * Batch callback; sitemap regeneration finished.
   *
   * On success the generation time is stored, a watchdog notice is logged and
   * hook_xmlsitemap_regenerate_finished() is invoked. Otherwise an error
   * message is shown to the user.
   *
   * @param bool $success
   *   Whether the batch completed without errors.
   * @param mixed $results
   *   Unused.
   * @param mixed $operations
   *   Unused.
   * @param mixed $elapsed
   *   The elapsed time, inserted into the watchdog message as given.
   */
  function xmlsitemap_regenerate_batch_finished($success, $results, $operations, $elapsed) {
    // The batch counts as failed when it reports failure itself or when the
    // regenerate flag was never cleared by the last operation.
    $failed = !$success || variable_get('xmlsitemap_regenerate_needed', FALSE);
    if ($failed) {
      drupal_set_message(t('The sitemaps were not successfully regenerated.'), 'error');
      return;
    }

    variable_set('xmlsitemap_generated_last', REQUEST_TIME);

    // Success is reported through watchdog only; no on-screen message is set.
    $replacements = array(
      '@elapsed' => $elapsed,
      '@memory-peak' => format_size(memory_get_peak_usage(TRUE)),
    );
    watchdog('xmlsitemap', 'Finished XML sitemap generation in @elapsed. Memory usage: @memory-peak.', $replacements, WATCHDOG_NOTICE);

    module_invoke_all('xmlsitemap_regenerate_finished');
  }
  /**
   * Batch information callback for rebuilding the sitemap data.
   *
   * @param array $entities
   *   The link (entity) types whose links should be rebuilt.
   * @param bool $save_custom
   *   Passed on, cast to a boolean, to the purge operation.
   *
   * @return array
   *   A batch definition with 'operations', 'finished', 'title' and 'file'
   *   keys. The operations of the regeneration batch are appended to the
   *   rebuild operations.
   */
  function xmlsitemap_rebuild_batch(array $entities, $save_custom = FALSE) {
    $operations = array();

    // Set the rebuild flag in case something fails during the rebuild.
    $operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_rebuild_needed' => TRUE)));

    // Purge any links first.
    $operations[] = array('xmlsitemap_rebuild_batch_clear', array($entities, (bool) $save_custom));

    // Fetch all the sitemap links and save them to the {xmlsitemap} table,
    // using the rebuild callback declared by each link type.
    foreach ($entities as $entity) {
      $link_info = xmlsitemap_get_link_info($entity);
      $operations[] = array($link_info['xmlsitemap']['rebuild callback'], array($entity));
    }

    // Clear the rebuild flag.
    $operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_rebuild_needed' => FALSE)));

    // Add the regeneration batch.
    $regenerate_batch = xmlsitemap_regenerate_batch();
    $operations = array_merge($operations, $regenerate_batch['operations']);

    return array(
      'operations' => $operations,
      'finished' => 'xmlsitemap_rebuild_batch_finished',
      'title' => t('Rebuilding Sitemap'),
      'file' => drupal_get_path('module', 'xmlsitemap') . '/xmlsitemap.generate.inc',
    );
  }
  /**
   * Batch callback; set an array of variables and their values.
   *
   * @param array $variables
   *   An array of values to store with variable_set(), keyed by variable name.
   */
  function xmlsitemap_batch_variable_set(array $variables) {
    foreach (array_keys($variables) as $name) {
      variable_set($name, $variables[$name]);
    }
  }
  /**
   * Batch callback; clear sitemap links for entities.
   *
   * @param array $entities
   *   The link types to purge; nothing is purged when the array is empty.
   * @param bool $save_custom
   *   Passed through to xmlsitemap_rebuild_clear().
   * @param array $context
   *   The batch context; 'message' is set.
   */
  function xmlsitemap_rebuild_batch_clear(array $entities, $save_custom, &$context) {
    if ($entities) {
      xmlsitemap_rebuild_clear($entities, $save_custom);
    }

    $context['message'] = t('Purging links.');
  }
  /**
   * Batch callback; fetch and add the sitemap links for a specific entity.
   *
   * Entities are processed in ascending ID order, 20 per invocation, by the
   * 'process callback' declared in the link info of the entity type.
   *
   * @param string $entity
   *   The entity type to process.
   * @param array $context
   *   The batch context; 'sandbox', 'message' and 'finished' are updated.
   */
  function xmlsitemap_rebuild_batch_fetch($entity, &$context) {
    if (!isset($context['sandbox']['info'])) {
      $context['sandbox']['info'] = xmlsitemap_get_link_info($entity);
      $context['sandbox']['progress'] = 0;
      $context['sandbox']['last_id'] = 0;
    }
    $info = $context['sandbox']['info'];

    // Select the entities that come after the last one processed.
    $query = new EntityFieldQuery();
    $query->entityCondition('entity_type', $entity);
    $query->entityCondition('entity_id', $context['sandbox']['last_id'], '>');
    $query->addTag('xmlsitemap_link_bundle_access');
    $query->addTag('xmlsitemap_rebuild');
    $query->addMetaData('entity', $entity);
    $query->addMetaData('entity_info', $info);

    if ($types = xmlsitemap_get_link_type_enabled_bundles($entity)) {
      $query->entityCondition('bundle', $types, 'IN');
    }
    else {
      // If no enabled bundle types, skip everything else.
      return;
    }

    // The total is counted once, on the first invocation.
    if (!isset($context['sandbox']['max'])) {
      $count_query = clone $query;
      $count_query->count();
      $context['sandbox']['max'] = $count_query->execute();
      if (!$context['sandbox']['max']) {
        // If there are no items to process, skip everything else.
        return;
      }
    }

    // PostgreSQL cannot have the ORDERED BY in the count query.
    $query->entityOrderBy('entity_id');

    $limit = 20;
    $query->range(0, $limit);

    $result = $query->execute();

    // The result can be empty even though the counted total was not reached,
    // for example when entities were deleted while the batch was running.
    // Without this guard $result[$entity] would be undefined, array_keys()
    // would receive NULL and 'last_id' would be overwritten with FALSE.
    if (empty($result[$entity])) {
      $context['finished'] = 1;
      return;
    }

    $ids = array_keys($result[$entity]);

    $info['xmlsitemap']['process callback']($ids);
    $context['sandbox']['last_id'] = end($ids);
    $context['sandbox']['progress'] += count($ids);
    $context['message'] = t('Now processing %entity @last_id (@progress of @count).', array(
      '%entity' => $entity,
      '@last_id' => $context['sandbox']['last_id'],
      '@progress' => $context['sandbox']['progress'],
      '@count' => $context['sandbox']['max'],
    ));

    if ($context['sandbox']['progress'] >= $context['sandbox']['max']) {
      $context['finished'] = 1;
    }
    else {
      $context['finished'] = $context['sandbox']['progress'] / $context['sandbox']['max'];
    }
  }
  /**
   * Batch callback; sitemap rebuild finished.
   *
   * Shows a status message when the batch succeeded and the
   * 'xmlsitemap_rebuild_needed' flag was cleared, and an error message
   * otherwise.
   *
   * @param bool $success
   *   Whether the batch completed without errors.
   * @param mixed $results
   *   Unused.
   * @param mixed $operations
   *   Unused.
   * @param mixed $elapsed
   *   Unused.
   */
  function xmlsitemap_rebuild_batch_finished($success, $results, $operations, $elapsed) {
    $failed = !$success || variable_get('xmlsitemap_rebuild_needed', FALSE);
    if ($failed) {
      drupal_set_message(t('The sitemap links were not successfully rebuilt.'), 'error');
      return;
    }

    drupal_set_message(t('The sitemap links were rebuilt.'));
  }
  /**
   * Get the link types whose sitemap links can be rebuilt.
   *
   * @return array
   *   The names of all link types that declare a rebuild callback and, when
   *   they define a bundle entity key, have at least one enabled bundle.
   */
  function xmlsitemap_get_rebuildable_link_types() {
    $rebuild_types = array();

    foreach (xmlsitemap_get_link_info() as $entity => $info) {
      // If the entity is missing a rebuild callback, skip.
      if (empty($info['xmlsitemap']['rebuild callback'])) {
        continue;
      }

      // If the entity has bundles, but no enabled bundles, skip since
      // rebuilding wouldn't get any links.
      $has_bundles = !empty($info['entity keys']['bundle']);
      if ($has_bundles && !xmlsitemap_get_link_type_enabled_bundles($entity)) {
        continue;
      }

      $rebuild_types[] = $entity;
    }

    return $rebuild_types;
  }
  /**
   * Clear all sitemap links for given entity types.
   *
   * hook_xmlsitemap_rebuild_clear() is invoked before the links are deleted.
   *
   * @param array $types
   *   An array of link types.
   * @param bool $save_custom
   *   A boolean if links with status or priority overridden should not be
   *   removed (and hence overridden values not lost).
   *
   * @return int
   *   The number of deleted links.
   */
  function xmlsitemap_rebuild_clear(array $types, $save_custom) {
    // Let other modules respond to the rebuild clearing.
    module_invoke_all('xmlsitemap_rebuild_clear', $types, $save_custom);

    $delete = db_delete('xmlsitemap')->condition('type', $types);

    // If we want to save the custom data, make sure to exclude any links
    // that are not using default inclusion or priority.
    if ($save_custom) {
      $delete->condition('status_override', 0)
        ->condition('priority_override', 0);
    }

    return $delete->execute();
  }
  517. }