xmlsitemap.generate.inc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. <?php
  2. /**
  3. * @file
  4. * Sitemap generation and rebuilding functions for the xmlsitemap module.
  5. *
  6. * @ingroup xmlsitemap
  7. */
  /**
   * Given an internal Drupal path, return the alias for the path.
   *
   * This is similar to drupal_get_path_alias(), but designed to fetch all
   * aliases of a language at once so that only one database query is executed
   * instead of several or possibly thousands during sitemap generation.
   *
   * @param $path
   *   An internal Drupal path.
   * @param $language
   *   A language code to use when looking up the paths.
   *
   * @return
   *   The cached alias for the path (language-specific first, then
   *   LANGUAGE_NONE) or the path itself when no alias is cached, after it has
   *   been passed through hook_url_outbound_alter().
   */
  function xmlsitemap_get_path_alias($path, $language) {
    static $alias_cache;
    static $cached_language;

    $is_specific_language = ($language != LANGUAGE_NONE);

    // The language-neutral aliases are loaded once and kept for every lookup.
    if (!isset($alias_cache)) {
      $alias_cache[LANGUAGE_NONE] = db_query("SELECT source, alias FROM {url_alias} WHERE language = :language ORDER BY pid", array(':language' => LANGUAGE_NONE))->fetchAllKeyed();
    }

    // Only one language-specific alias set is held at a time: when the requested
    // language changes, the previous set is dropped before the new one is loaded.
    if ($is_specific_language && $cached_language != $language) {
      unset($alias_cache[$cached_language]);
      $alias_cache[$language] = db_query("SELECT source, alias FROM {url_alias} WHERE language = :language ORDER BY pid", array(':language' => $language))->fetchAllKeyed();
      $cached_language = $language;
    }

    // Prefer the alias in the requested language, then the language-neutral one.
    $alias = NULL;
    if ($is_specific_language && isset($alias_cache[$language][$path])) {
      $alias = $alias_cache[$language][$path];
    }
    elseif (isset($alias_cache[LANGUAGE_NONE][$path])) {
      $alias = $alias_cache[LANGUAGE_NONE][$path];
    }

    // hook_url_outbound_alter() expects defaults in url() options.
    $options = array(
      'fragment' => '',
      'query' => array(),
      'absolute' => FALSE,
      'alias' => isset($alias),
      'prefix' => '',
      'external' => FALSE,
    );

    // We need to pass our path through hook_url_outbound_alter(). This fixes
    // clean URLs not working when they don't exist in the {url_alias} table and
    // are created with something like subpathauto.
    $normalized_path = isset($alias) ? $alias : $path;
    $original_path = $normalized_path;
    drupal_alter('url_outbound', $normalized_path, $options, $original_path);

    return $normalized_path;
  }
  /**
   * Perform operations before rebuilding the sitemap.
   *
   * Tries to raise the PHP memory limit and, when the
   * 'xmlsitemap_developer_mode' variable is enabled, logs the current peak
   * memory usage as a debug message.
   */
  function _xmlsitemap_regenerate_before() {
    // Attempt to increase the memory limit.
    _xmlsitemap_set_memory_limit();

    // Nothing is logged outside of developer mode.
    if (!variable_get('xmlsitemap_developer_mode', 0)) {
      return;
    }

    $replacements = array(
      '@memory-peak' => format_size(memory_get_peak_usage(TRUE)),
    );
    watchdog('xmlsitemap', 'Starting XML sitemap generation. Memory usage: @memory-peak.', $replacements, WATCHDOG_DEBUG);
  }
  /**
   * Report how far peak memory usage has grown since a stored baseline.
   *
   * @param $start
   *   TRUE to reset the baseline to the current peak usage. The baseline is also
   *   set on the very first call.
   *
   * @return
   *   The difference in bytes between the current peak memory usage and the
   *   baseline; 0 on any call that (re)sets the baseline.
   */
  function _xmlsitemap_get_memory_usage($start = FALSE) {
    static $baseline;

    $peak = memory_get_peak_usage(TRUE);
    if ($start || !isset($baseline)) {
      $baseline = $peak;
    }

    return $peak - $baseline;
  }
  /**
   * Calculate the optimal PHP memory limit for sitemap generation.
   *
   * This function just makes a guess. It does not take into account
   * the currently loaded modules.
   *
   * @return
   *   The estimated limit: the Drupal minimum memory limit, plus 500 per link in
   *   a chunk, plus 250 per URL alias when alias prefetching is enabled. The
   *   value is computed once and kept in drupal_static().
   */
  function _xmlsitemap_get_optimal_memory_limit() {
    $limit = &drupal_static(__FUNCTION__);

    // Reuse the previously computed estimate.
    if (isset($limit)) {
      return $limit;
    }

    // Set the base memory amount from the provided core constant.
    $limit = parse_size(DRUPAL_MINIMUM_PHP_MEMORY_LIMIT);

    // Add memory based on the chunk size.
    $limit += xmlsitemap_get_chunk_size() * 500;

    // Add memory for storing the url aliases.
    if (variable_get('xmlsitemap_prefetch_aliases', 1)) {
      $alias_count = db_query("SELECT COUNT(pid) FROM {url_alias}")->fetchField();
      $limit += $alias_count * 250;
    }

    return $limit;
  }
  /**
   * Raise the PHP memory limit for sitemap generation when it is too low.
   *
   * The limit is only ever increased: nothing happens when PHP reports no limit
   * (empty or -1) or when the current limit is already at least the requested
   * one.
   *
   * @param $new_limit
   *   An optional PHP memory limit in bytes. If not provided, the value of
   *   _xmlsitemap_get_optimal_memory_limit() will be used.
   *
   * @return
   *   The result of ini_set() when a change was attempted, otherwise NULL.
   */
  function _xmlsitemap_set_memory_limit($new_limit = NULL) {
    // Errors are suppressed on purpose: raising the limit is best-effort and
    // ini_get()/ini_set() may be disabled by the host.
    $current_limit = @ini_get('memory_limit');
    if ($current_limit && $current_limit != -1) {
      // Fall back to the computed estimate only when the caller gave no limit.
      // The check used to be inverted, which discarded explicit limits and left
      // the default NULL in place so the comparison below never succeeded.
      if (is_null($new_limit)) {
        $new_limit = _xmlsitemap_get_optimal_memory_limit();
      }
      if (parse_size($current_limit) < $new_limit) {
        return @ini_set('memory_limit', $new_limit);
      }
    }
  }
  /**
   * Generate one page (chunk) of the sitemap.
   *
   * @param $sitemap
   *   The XML sitemap object (stdClass) to generate a page for.
   * @param $page
   *   An integer of the specific page of the sitemap to generate.
   *
   * @return
   *   The element count reported by the writer for the generated page.
   *
   * @throws Exception
   *   Any exception raised while writing is logged with watchdog_exception()
   *   and then re-thrown to the caller.
   */
  function xmlsitemap_generate_page(stdClass $sitemap, $page) {
    try {
      $writer = new XMLSitemapWriter($sitemap, $page);
      $writer->startDocument();
      $writer->generateXML();
      $writer->endDocument();
    }
    catch (Exception $e) {
      // Log first, then let the caller decide how to handle the failure. The
      // former "return FALSE" after the throw could never execute.
      watchdog_exception('xmlsitemap', $e);
      throw $e;
    }
    return $writer->getSitemapElementCount();
  }
  /**
   * Write the <url> elements for one chunk of a sitemap.
   *
   * Reads the visible (access = 1, status = 1) links for the requested chunk
   * from the {xmlsitemap} table, builds the URL and optional lastmod,
   * changefreq and priority values for each, and hands them to the writer.
   *
   * @param $sitemap
   *   The XML sitemap object; its uri['options'] are used as url() options.
   * @param $writer
   *   The writer that receives each element through writeSitemapElement().
   * @param $chunk
   *   The 1-based chunk number; values below 1 are treated as the first chunk.
   *
   * @return
   *   The number of links written, not counting skipped consecutive duplicates.
   */
  function xmlsitemap_generate_chunk(stdClass $sitemap, XMLSitemapWriter $writer, $chunk) {
    $enabled_elements = drupal_map_assoc(variable_get('xmlsitemap_output_elements', array('lastmod', 'changefreq', 'priority')));
    $lastmod_format = variable_get('xmlsitemap_lastmod_format', XMLSITEMAP_LASTMOD_MEDIUM);

    // Sitemap-specific options win; the defaults only fill in missing keys.
    $url_options = $sitemap->uri['options'] + array(
      'absolute' => TRUE,
      'base_url' => variable_get('xmlsitemap_base_url', $GLOBALS['base_url']),
      'language' => language_default(),
      'alias' => variable_get('xmlsitemap_prefetch_aliases', TRUE),
    );

    $previous_url = '';
    $written = 0;

    $query = db_select('xmlsitemap', 'x')
      ->fields('x', array('id', 'type', 'subtype', 'loc', 'lastmod', 'changefreq', 'changecount', 'priority', 'language', 'access', 'status'))
      ->condition('x.access', 1)
      ->condition('x.status', 1)
      ->orderBy('x.language', 'DESC')
      ->orderBy('x.loc');
    $query->addTag('xmlsitemap_generate');
    $query->addMetaData('sitemap', $sitemap);

    $offset = max($chunk - 1, 0) * xmlsitemap_get_chunk_size();
    $limit = xmlsitemap_get_chunk_size();
    $query->range($offset, $limit);

    $rows = $query->execute();
    while ($link = $rows->fetchAssoc()) {
      // Replace the language code with a language object; language-neutral
      // links use the sitemap's language.
      if ($link['language'] != LANGUAGE_NONE) {
        $link['language'] = xmlsitemap_language_load($link['language']);
      }
      else {
        $link['language'] = $url_options['language'];
      }

      if ($url_options['alias']) {
        $link['loc'] = xmlsitemap_get_path_alias($link['loc'], $link['language']->language);
      }

      // @todo Add a separate hook_xmlsitemap_link_url_alter() here?
      $link_options = array(
        'language' => $link['language'],
        'xmlsitemap_link' => $link,
        'xmlsitemap_sitemap' => $sitemap,
      );
      $link_url = url($link['loc'], $link_options + $url_options);

      // Skip this link if it was a duplicate of the last one.
      // @todo Figure out a way to do this before generation so we can report
      // back to the user about this.
      if ($link_url == $previous_url) {
        continue;
      }
      $previous_url = $link_url;
      // Keep track of the total number of links written.
      $written++;

      $element = array('loc' => $link_url);

      if ($link['lastmod']) {
        if (!empty($enabled_elements['lastmod'])) {
          $element['lastmod'] = gmdate($lastmod_format, $link['lastmod']);
        }
        // If the link has a lastmod value, update the changefreq so that links
        // with a short changefreq but updated two years ago show decay.
        // We use abs() here just in case items were created on this same cron
        // run because lastmod would be greater than REQUEST_TIME.
        $link['changefreq'] = (abs(REQUEST_TIME - $link['lastmod']) + $link['changefreq']) / 2;
      }

      if (!empty($enabled_elements['changefreq']) && $link['changefreq']) {
        $element['changefreq'] = xmlsitemap_get_changefreq($link['changefreq']);
      }

      // Don't output the priority value for links that have 0.5 priority. This
      // is the default 'assumed' value if priority is not included as per the
      // sitemaps.org specification.
      if (!empty($enabled_elements['priority']) && isset($link['priority']) && $link['priority'] != 0.5) {
        $element['priority'] = number_format($link['priority'], 1);
      }

      // @todo Should this be moved to XMLSitemapWriter::writeSitemapElement()?
      drupal_alter('xmlsitemap_element', $element, $link, $sitemap);
      $writer->writeSitemapElement('url', $element);
    }

    return $written;
  }
  /**
   * Generate the index sitemap.
   *
   * @param $sitemap
   *   The XML sitemap object (stdClass) to generate the index for.
   *
   * @return
   *   The element count reported by the index writer.
   *
   * @throws Exception
   *   Any exception raised while writing is logged with watchdog_exception()
   *   and then re-thrown to the caller.
   */
  function xmlsitemap_generate_index(stdClass $sitemap) {
    try {
      $writer = new XMLSitemapIndexWriter($sitemap);
      $writer->startDocument();
      $writer->generateXML();
      $writer->endDocument();
    }
    catch (Exception $e) {
      // Log first, then let the caller decide how to handle the failure. The
      // former "return FALSE" after the throw could never execute.
      watchdog_exception('xmlsitemap', $e);
      throw $e;
    }
    return $writer->getSitemapElementCount();
  }
  233. // BATCH OPERATIONS ------------------------------------------------------------
  /**
   * Batch information callback for regenerating the sitemap files.
   *
   * @param $smids
   *   An optional array of XML sitemap IDs. If not provided, it will load all
   *   existing XML sitemaps.
   *
   * @return
   *   A batch definition array with the operations, finished callback, title
   *   and include file for the regeneration.
   */
  function xmlsitemap_regenerate_batch(array $smids = array()) {
    if (empty($smids)) {
      $smids = db_query("SELECT smid FROM {xmlsitemap_sitemap}")->fetchCol();
    }

    $operations = array();

    // Set the regenerate flag in case something fails during file generation.
    $operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_regenerate_needed' => TRUE)));

    // @todo Get rid of this batch operation.
    $operations[] = array('_xmlsitemap_regenerate_before', array());

    // Generate all the sitemap pages for each context, followed by its index.
    foreach ($smids as $smid) {
      $operations[] = array('xmlsitemap_regenerate_batch_generate', array($smid));
      $operations[] = array('xmlsitemap_regenerate_batch_generate_index', array($smid));
    }

    // Clear the regeneration flag.
    $operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_regenerate_needed' => FALSE)));

    return array(
      'operations' => $operations,
      'finished' => 'xmlsitemap_regenerate_batch_finished',
      'title' => t('Regenerating Sitemap'),
      'file' => drupal_get_path('module', 'xmlsitemap') . '/xmlsitemap.generate.inc',
    );
  }
  /**
   * Batch callback; generate all pages of a sitemap.
   *
   * Each invocation generates one page. The sitemap object and its running
   * chunk and link counters are kept in the batch sandbox between invocations;
   * the first page that yields no links ends the operation and saves the
   * sitemap.
   *
   * @param $smid
   *   The ID of the XML sitemap to generate.
   * @param $context
   *   The batch context array.
   */
  function xmlsitemap_regenerate_batch_generate($smid, array &$context) {
    $sitemap = &$context['sandbox']['sitemap'];

    // First invocation: load the sitemap and reset its counters.
    if (!isset($sitemap)) {
      $sitemap = xmlsitemap_sitemap_load($smid);
      $sitemap->chunks = 1;
      $sitemap->links = 0;
      $context['sandbox']['max'] = XMLSITEMAP_MAX_SITEMAP_LINKS;

      // Clear the cache directory for this sitemap before generating any files.
      xmlsitemap_check_directory($sitemap);
      xmlsitemap_clear_directory($sitemap);
    }

    $links = xmlsitemap_generate_page($sitemap, $sitemap->chunks);
    $context['message'] = t('Now generating %sitemap-url.', array('%sitemap-url' => url('sitemap.xml', $sitemap->uri['options'] + array('query' => array('page' => $sitemap->chunks)))));

    if (!$links) {
      // Cleanup the 'extra' empty file.
      $file = xmlsitemap_sitemap_get_file($sitemap, $sitemap->chunks);
      if (file_exists($file) && $sitemap->chunks > 1) {
        file_unmanaged_delete($file);
      }
      $sitemap->chunks--;

      // Save the updated chunks and links values.
      $context['sandbox']['max'] = $sitemap->chunks;
      $sitemap->updated = REQUEST_TIME;
      xmlsitemap_sitemap_get_max_filesize($sitemap);
      xmlsitemap_sitemap_save($sitemap);
    }
    else {
      // The page had links: count them and move on to the next page.
      $sitemap->links += $links;
      $sitemap->chunks++;
    }

    // 'finished' is only set while pages remain; otherwise the batch default
    // is left in place.
    if ($sitemap->chunks != $context['sandbox']['max']) {
      $context['finished'] = $sitemap->chunks / $context['sandbox']['max'];
    }
  }
  /**
   * Batch callback; generate the index page of a sitemap.
   *
   * An index is only written when the sitemap has more than one chunk.
   *
   * @param $smid
   *   The ID of the XML sitemap to generate the index for.
   * @param $context
   *   The batch context array.
   */
  function xmlsitemap_regenerate_batch_generate_index($smid, array &$context) {
    $sitemap = xmlsitemap_sitemap_load($smid);

    // Single-page sitemaps do not get an index.
    if (!($sitemap->chunks > 1)) {
      return;
    }

    xmlsitemap_generate_index($sitemap);
    $index_url = url('sitemap.xml', $sitemap->uri['options']);
    $context['message'] = t('Now generating sitemap index %sitemap-url.', array('%sitemap-url' => $index_url));
  }
  /**
   * Batch callback; sitemap regeneration finished.
   *
   * Regeneration counts as successful only when the batch succeeded and the
   * 'xmlsitemap_regenerate_needed' flag has been cleared.
   */
  function xmlsitemap_regenerate_batch_finished($success, $results, $operations, $elapsed) {
    $failed = !$success || variable_get('xmlsitemap_regenerate_needed', FALSE);
    if ($failed) {
      drupal_set_message(t('The sitemaps were not successfully regenerated.'), 'error');
      return;
    }

    variable_set('xmlsitemap_generated_last', REQUEST_TIME);

    // Show a watchdog message that the sitemap was regenerated.
    $replacements = array(
      '@elapsed' => $elapsed,
      '@memory-peak' => format_size(memory_get_peak_usage(TRUE)),
    );
    watchdog('xmlsitemap', 'Finished XML sitemap generation in @elapsed. Memory usage: @memory-peak.', $replacements, WATCHDOG_NOTICE);
  }
  /**
   * Batch information callback for rebuilding the sitemap data.
   *
   * @param $entities
   *   An array of link (entity) types to rebuild.
   * @param $save_custom
   *   Passed on, cast to boolean, to the link purge operation.
   *
   * @return
   *   A batch definition array whose operations rebuild the links and then
   *   regenerate the sitemap files.
   */
  function xmlsitemap_rebuild_batch(array $entities, $save_custom = FALSE) {
    $batch = array(
      'operations' => array(),
      'finished' => 'xmlsitemap_rebuild_batch_finished',
      'title' => t('Rebuilding Sitemap'),
      'file' => drupal_get_path('module', 'xmlsitemap') . '/xmlsitemap.generate.inc',
    );

    $rebuild_operations = array(
      // Set the rebuild flag in case something fails during the rebuild.
      array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_rebuild_needed' => TRUE))),
      // Purge any links first.
      array('xmlsitemap_rebuild_batch_clear', array($entities, (bool) $save_custom)),
    );

    // Fetch all the sitemap links and save them to the {xmlsitemap} table.
    foreach ($entities as $entity) {
      $info = xmlsitemap_get_link_info($entity);
      $rebuild_operations[] = array($info['xmlsitemap']['rebuild callback'], array($entity));
    }

    // Clear the rebuild flag.
    $rebuild_operations[] = array('xmlsitemap_batch_variable_set', array(array('xmlsitemap_rebuild_needed' => FALSE)));

    // Add the regeneration batch.
    $regenerate_batch = xmlsitemap_regenerate_batch();
    $batch['operations'] = array_merge($rebuild_operations, $regenerate_batch['operations']);

    return $batch;
  }
  /**
   * Batch callback; set an array of variables and their values.
   *
   * @param $variables
   *   An array of values keyed by variable name; each pair is stored with
   *   variable_set().
   */
  function xmlsitemap_batch_variable_set(array $variables) {
    foreach (array_keys($variables) as $name) {
      variable_set($name, $variables[$name]);
    }
  }
  /**
   * Batch callback; clear sitemap links for entities.
   *
   * @param $entities
   *   An array of link types to purge; nothing is purged when it is empty.
   * @param $save_custom
   *   Passed on to xmlsitemap_rebuild_clear().
   * @param $context
   *   The batch context array; receives the progress message.
   */
  function xmlsitemap_rebuild_batch_clear(array $entities, $save_custom, &$context) {
    if (count($entities) > 0) {
      xmlsitemap_rebuild_clear($entities, $save_custom);
    }
    $context['message'] = t('Purging links.');
  }
  /**
   * Batch callback; fetch and add the sitemap links for a specific entity.
   *
   * Each invocation processes up to 20 entities whose ID is greater than the
   * last one handled, by passing their IDs to the entity type's
   * 'process callback'. Progress is tracked in the batch sandbox.
   *
   * @param $entity
   *   The entity type to process.
   * @param $context
   *   The batch context array.
   */
  function xmlsitemap_rebuild_batch_fetch($entity, &$context) {
    if (!isset($context['sandbox']['info'])) {
      $context['sandbox']['info'] = xmlsitemap_get_link_info($entity);
      $context['sandbox']['progress'] = 0;
      $context['sandbox']['last_id'] = 0;
    }
    $info = $context['sandbox']['info'];

    $query = new EntityFieldQuery();
    $query->entityCondition('entity_type', $entity);
    $query->entityCondition('entity_id', $context['sandbox']['last_id'], '>');
    $query->addTag('xmlsitemap_link_bundle_access');
    $query->addTag('xmlsitemap_rebuild');
    $query->addMetaData('entity', $entity);
    $query->addMetaData('entity_info', $info);

    if ($types = xmlsitemap_get_link_type_enabled_bundles($entity)) {
      $query->entityCondition('bundle', $types, 'IN');
    }
    else {
      // If no enabled bundle types, skip everything else.
      return;
    }

    if (!isset($context['sandbox']['max'])) {
      $count_query = clone $query;
      $count_query->count();
      $context['sandbox']['max'] = $count_query->execute();
      if (!$context['sandbox']['max']) {
        // If there are no items to process, skip everything else.
        return;
      }
    }

    // PostgreSQL cannot have the ORDER BY in the count query.
    $query->entityOrderBy('entity_id');

    $limit = 20; //variable_get('xmlsitemap_batch_limit', 100)
    $query->range(0, $limit);

    $result = $query->execute();
    if (empty($result[$entity])) {
      // No rows came back even though the initial count expected more (for
      // example, entities were removed in the meantime). Progress can never
      // reach the stored maximum in that case, so finish explicitly instead of
      // letting the batch repeat this operation forever.
      $context['finished'] = 1;
      return;
    }

    $ids = array_keys($result[$entity]);
    $info['xmlsitemap']['process callback']($ids);

    // Remember where to resume on the next invocation.
    $context['sandbox']['last_id'] = end($ids);
    $context['sandbox']['progress'] += count($ids);
    $context['message'] = t('Now processing %entity @last_id (@progress of @count).', array('%entity' => $entity, '@last_id' => $context['sandbox']['last_id'], '@progress' => $context['sandbox']['progress'], '@count' => $context['sandbox']['max']));

    if ($context['sandbox']['progress'] >= $context['sandbox']['max']) {
      $context['finished'] = 1;
    }
    else {
      $context['finished'] = $context['sandbox']['progress'] / $context['sandbox']['max'];
    }
  }
  /**
   * Batch callback; sitemap rebuild finished.
   *
   * The rebuild counts as successful only when the batch succeeded and the
   * 'xmlsitemap_rebuild_needed' flag has been cleared.
   */
  function xmlsitemap_rebuild_batch_finished($success, $results, $operations, $elapsed) {
    $failed = !$success || variable_get('xmlsitemap_rebuild_needed', FALSE);
    if ($failed) {
      drupal_set_message(t('The sitemap links were not successfully rebuilt.'), 'error');
      return;
    }
    drupal_set_message(t('The sitemap links were rebuilt.'));
  }
  /**
   * List the link types whose sitemap links can be rebuilt.
   *
   * A type qualifies when it defines a 'rebuild callback' and, if it declares a
   * bundle entity key, has at least one enabled bundle.
   *
   * @return
   *   An array of link (entity) type names.
   */
  function xmlsitemap_get_rebuildable_link_types() {
    $rebuild_types = array();

    foreach (xmlsitemap_get_link_info() as $entity => $info) {
      $has_callback = !empty($info['xmlsitemap']['rebuild callback']);
      $uses_bundles = !empty($info['entity keys']['bundle']);

      // Without a rebuild callback the type is skipped outright. With bundles
      // but none enabled, rebuilding wouldn't get any links, so it is skipped
      // as well.
      if ($has_callback && (!$uses_bundles || xmlsitemap_get_link_type_enabled_bundles($entity))) {
        $rebuild_types[] = $entity;
      }
    }

    return $rebuild_types;
  }
  /**
   * Clear all sitemap links for given entity types.
   *
   * @param array $types
   *   An array of link types.
   * @param bool $save_custom
   *   A boolean if links with status or priority overridden should not be
   *   removed (and hence overridden values not lost).
   *
   * @return int
   *   The number of deleted links.
   */
  function xmlsitemap_rebuild_clear(array $types, $save_custom) {
    // Let other modules respond to the rebuild clearing.
    module_invoke_all('xmlsitemap_rebuild_clear', $types, $save_custom);

    $delete = db_delete('xmlsitemap')->condition('type', $types);

    // If we want to save the custom data, only delete links that still use the
    // default inclusion and priority.
    if ($save_custom) {
      $delete->condition('status_override', 0)->condition('priority_override', 0);
    }

    return $delete->execute();
  }