commit 57719052f3c7da91c37a5e233674c8ffa91c242a Author: Valentin Le Moign Date: Tue May 12 23:33:56 2026 +0200 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9507787 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# --- OS / éditeur --- +.DS_Store +Thumbs.db +*.swp +*~ +.idea/ +.vscode/ + +# --- Archives --- +*.tar.gz +*.tgz +*.zip diff --git a/README.md b/README.md new file mode 100644 index 0000000..50252c2 --- /dev/null +++ b/README.md @@ -0,0 +1,103 @@ +# thalim-hal-importer + +Plugin WordPress qui importe les publications du laboratoire THALIM depuis l'archive ouverte [HAL](https://hal.science/) (structure `254015`) et les transforme en posts WordPress avec tous les champs Pods renseignés. + +- **Version :** 2.0.0 +- **Auteur :** THALIM Dev +- **Licence :** GPL v2 or later + +## Installation + +```bash +cd wp-content/plugins +git clone gitea@figureslibres.io:valentin_le_moign/thalim-plugin-hal-importer.git thalim-hal-importer +``` + +Puis activer depuis l'admin WordPress. Dans le cadre du projet THALIM, le clonage est automatisé par `bootstrap.sh` du repo [`thalim-stack`](https://figureslibres.io/valentin_le_moign/thalim-stack). + +## Utilisation + +Une fois activé, le plugin ajoute une page d'administration : **Outils → HAL Import** (capacité requise : `edit_others_posts`). + +La page propose deux flux d'import : + +### 1. 
Aperçu live + import incrémental + +- Filtres : plage de dates (`producedDate_s`) et auteur (idHAL d'un membre THALIM) +- Liste les publications HAL correspondantes avec statut coloré : + - **vert** : déjà importée (présence du meta `hal_id` côté WP) + - **jaune** : prête à être importée (au moins un auteur HAL matche un user WordPress) + - **rouge** : aucun auteur THALIM identifié → ignorée +- Bouton **Importer** : crée tous les posts « prêts » en statut `pending` (à publier après relecture) +- Cache des aperçus en transient (5 min, clé hashée sur les filtres), rafraîchissable manuellement +- Bouton **Test API** pour vérifier la connexion + +### 2. Import en masse via CSV (legacy SPIP) + +Permet d'importer des publications anciennes par lots de 100 : + +- Upload d'un CSV avec une colonne `hal_id` + d'un fichier de contexte SPIP (axes/tags/programmes/owner par publication) +- Traitement par batchs séquentiels (cliquer plusieurs fois) +- Rapport CSV téléchargeable en fin de file +- Annulation possible à tout moment + +## Mapping des types HAL → catégories WordPress + +`DOC_TYPE_MAP` dans `includes/class-importer.php` : + +| Type HAL | Description | Catégorie WP | +| ------------- | ---------------------------- | ------------ | +| `ART` | Article | `16` | +| `COUV` | Chapitre d'ouvrage | `16` | +| `OUV` | Ouvrage | `15` | +| `COMM` | Communication | `13` | +| `ISSUE` | Direction de numéro | `16` | +| `PROCEEDINGS` | Direction d'ouvrage / actes | `15` | +| `THESE` | Thèse | `14` | +| `HDR` | HDR | `14` | +| `SON` | Son | `19` | +| `VIDEO` | Vidéo | `19` | + +`COMM`, `THESE`, `HDR`, `SON`, `VIDEO` sont traités comme événements et utilisent le champ Pods `date_de_debut`. Les autres utilisent `datetime`. 
+ +## Champs HAL → champs WP + +À l'import, chaque publication remplit : + +- **Identification** : `hal_id`, `hal_url` (URI HAL), `lien_externe_1` (PDF si dispo) avec titre `Document HAL // HAL Document` +- **Titre** : `post_title` (issu de HAL) +- **Catégorie** : assignation native du terme + triple écriture Pods (meta `categorie` + meta `_pods_categorie` + ligne `wp_podsrel`) +- **Auteurs HAL → membres** : matching via `authIdHal_s` ↔ user meta `identifiant_hal`. Stockage en triple-pattern Pods (`membres` add_post_meta + `_pods_membres` + lignes `wp_podsrel`) +- **Date** : `date_de_debut` ou `datetime` selon le type (`defenseDate_s` prioritaire pour THESE/HDR, sinon `producedDate_s`) ; le `post_date` du post est optionnellement backdaté sur `producedDate_s` +- **Métadonnées** : `journal`, `editeur`, `sous-titre` (book/conference title), `lieu` (city/country pour PROCEEDINGS), `reference_bibliographique` (`citationFull_s`) +- **Fonction** : `fonction_auteur` (varie selon doc type : « Auteur du chapitre // Chapter author », « Direction de numéro // Editor-in-Chief », etc.) +- **Axes thématiques** : cascade `spip_context` → co-auteurs THALIM → owner. Source effective stockée dans `$importer->last_axes_source` +- **Programmes de recherche** et **étiquettes** : depuis `spip_context` (import CSV) et/ou par rapprochement avec les mots-clés HAL (`keyword_s`) + +## Dédoublonnage + +L'import vérifie le meta `hal_id` avant chaque insertion : une publication ne peut pas être importée deux fois. Le `is_imported($hal_id)` est aussi affiché en colonne de statut dans l'aperçu. + +## Prérequis + +- WordPress 6.0+ +- PHP 7.4+ +- Plugin **Pods** (le pod `post` et le champ user `identifiant_hal`) +- IDs de catégorie WordPress conformes au mapping (13/14/15/16/19) — codés en dur dans `DOC_TYPE_MAP` + +## Structure + +``` +. 
+├── thalim-hal-importer.php # point d'entrée, constantes, bootstrap +└── includes/ + ├── class-hal-api.php # client API HAL (fetch_publications, fetch_by_hal_ids) + ├── class-admin-page.php # UI Tools > HAL Import (aperçu + CSV) + └── class-importer.php # mapping HAL → posts WP (triple-storage, axes cascade) +``` + +## API HAL + +- Base : `https://api.archives-ouvertes.fr/search/` +- Structure THALIM : `254015` +- Documentation : diff --git a/includes/class-admin-page.php b/includes/class-admin-page.php new file mode 100644 index 0000000..9af033b --- /dev/null +++ b/includes/class-admin-page.php @@ -0,0 +1,750 @@ + ['id' => int, 'name' => string] + + // Document type labels + private const DOC_TYPE_LABELS = [ + 'ART' => 'Article', + 'COUV' => "Chapitre d'ouvrage", + 'OUV' => 'Ouvrage', + 'COMM' => 'Communication', + 'ISSUE' => 'Direction de numéro', + 'PROCEEDINGS' => 'Colloque', + 'THESE' => 'Thèse', + 'HDR' => 'HDR', + 'SON' => 'Son', + 'VIDEO' => 'Vidéo', + ]; + + public function __construct() { + $this->api = new Thalim_HAL_API(); + } + + public function render() { + if (!current_user_can('edit_others_posts')) { + wp_die('Unauthorized'); + } + $this->handle_actions(); + echo '

THALIM HAL Importer

'; + $this->render_styles(); + $this->render_message(); + $this->render_config(); + $this->render_preview(); + $this->render_csv_import(); + echo '
'; + } + + private function handle_actions() { + if (!isset($_POST['thalim_hal_action'])) return; + if (!wp_verify_nonce($_POST['_wpnonce'] ?? '', 'thalim_hal_action')) { + $this->message = ['error', 'Security check failed.']; + return; + } + $action = sanitize_text_field($_POST['thalim_hal_action']); + + if ($action === 'test_api') { + $result = $this->api->test_connection(); + $this->message = is_wp_error($result) + ? ['error', 'API Error: ' . $result->get_error_message()] + : ['success', "Connection OK! Found {$result['total']} publications."]; + } + + if ($action === 'refresh') { + // Clear all preview transients (they are keyed by date range hash) + global $wpdb; + $wpdb->query("DELETE FROM {$wpdb->options} WHERE option_name LIKE '_transient_thalim_hal_preview_%'"); + $wpdb->query("DELETE FROM {$wpdb->options} WHERE option_name LIKE '_transient_timeout_thalim_hal_preview_%'"); + $this->message = ['success', 'Preview data refreshed from HAL API.']; + } + + if ($action === 'import_pending') { + $this->handle_import(); + } + + if ($action === 'csv_upload') $this->handle_csv_upload(); + if ($action === 'csv_batch') $this->handle_csv_batch(); + if ($action === 'csv_cancel') $this->handle_csv_cancel(); + if ($action === 'csv_download_report') $this->handle_csv_download_report(); + } + + /** + * Handle bulk import of ready publications as pending posts. + * Uses cached raw HAL docs to avoid a second outbound API call. + */ + private function handle_import() { + $date_from = sanitize_text_field($_POST['hal_date_from'] ?? ''); + $date_to = sanitize_text_field($_POST['hal_date_to'] ?? ''); + $author_hal_id = sanitize_text_field($_POST['hal_author_id'] ?? ''); + + // Reuse the cached preview data — raw_docs are stored alongside processed docs + $preview = $this->get_preview_data($date_from, $date_to, $author_hal_id); + if (is_wp_error($preview)) { + $this->message = ['error', 'API Error: ' . 
$preview->get_error_message()]; + return; + } + + $raw_docs = $preview['raw_docs'] ?? []; + if (empty($raw_docs)) { + $this->message = ['warning', 'Aucune publication dans le cache. Utilisez Filtrer pour charger les données d\'abord.']; + return; + } + + $this->load_wp_users_hal_ids(); + $importer = new Thalim_HAL_Importer_Logic(); + $imported = 0; + $skipped = 0; + $errors = []; + + foreach ($raw_docs as $doc) { + $hal_id = $doc['halId_s'] ?? ''; + $author_hal_ids = $doc['authIdHal_s'] ?? []; + $matched_users = $this->match_authors_to_users($author_hal_ids); + + if (empty($matched_users) || $importer->is_imported($hal_id)) { + $skipped++; + continue; + } + + $post_id = $importer->import($doc, $this->wp_users_by_hal_id); + if (is_wp_error($post_id)) { + $errors[] = $hal_id . ': ' . $post_id->get_error_message(); + } else { + $imported++; + } + } + + $msg = sprintf('%d publication(s) importée(s) en statut "En attente".', $imported); + if ($skipped) $msg .= sprintf(' %d ignorée(s) (déjà importées ou sans membre THALIM correspondant).', $skipped); + if (!empty($errors)) $msg .= ' Erreurs : ' . implode('; ', $errors); + + $this->message = [empty($errors) ? 'success' : 'warning', $msg]; + } + + private function render_styles() { + ?> + + message) return; + printf('

%s

', + esc_attr($this->message[0]), esc_html($this->message[1])); + } + + private function render_config() { + ?> +
+

Configuration

+ + + + +
Structure ID (THALIM)
Document Types
API Endpointapi->get_api_url(10)); ?>
+
+ + + +
+
+ load_wp_users_hal_ids(); + + $preview = $this->get_preview_data($date_from, $date_to, $author_hal_id); + $ready_count = is_wp_error($preview) ? 0 : $preview['stats']['ready']; + ?> +
+

Import Preview

+ +
+ + + + + + + + + + + Mis en cache 5 min + + + + +
+ + +

get_error_message()); ?>

+ + render_wp_users_debug(); ?> + render_summary($preview['stats']); ?> + render_preview_table($preview['docs']); ?> + render_legend(); ?> + +
+ +
+
+ + Total in HAL +
+
+ + Already Imported +
+
+ + Ready to Import +
+
+ + No Matched User +
+
+ No publications found.

'; + return; + } + ?> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
StatutHAL IDTitreTypeAuteursIDs HAL auteursDateMembres THALIMLien HAL
get_status_icon($doc); ?> + + +
+ +
+ + + + + + + + wp_users_by_hal_id[$normalized]); + ?> + + + + aucun + + + + + + + + Aucun + + + Voir sur HAL +
+ +
+ Légende : + ✓ Importé + ★ Prêt Membre THALIM identifié + ✗ Bloqué Aucun membre THALIM ne correspond aux IDs auteurs HAL +
+ load_wp_users_hal_ids(); + if (empty($this->wp_users_by_hal_id)) { + echo '

Aucun utilisateur WordPress n\'a le champ identifiant_hal renseigné.

'; + return; + } + ?> +
+ + Utilisateurs WordPress avec identifiant HAL (wp_users_by_hal_id); ?> utilisateurs) — Cliquer pour déplier + + + + + wp_users_by_hal_id as $hal_id => $user): ?> + + + + + + + + +
UtilisateurIdentifiant HALDebug (brut)Modifier
(ID : )"" ( car.)Modifier
+
+ api->fetch_publications($rows, 0, 'producedDate_tdate desc', $date_from, $date_to, $author_hal_id); + if (is_wp_error($result)) return $result; + + $importer = new Thalim_HAL_Importer_Logic(); + $this->load_wp_users_hal_ids(); + + $preview = [ + 'stats' => [ + 'total' => $result['response']['numFound'] ?? 0, + 'imported' => 0, + 'ready' => 0, + 'blocked' => 0 + ], + 'docs' => [], + 'raw_docs' => [], // Raw HAL docs kept for import, avoids a second API call + ]; + + foreach ($result['response']['docs'] ?? [] as $doc) { + $hal_id = $doc['halId_s'] ?? ''; + $is_imported = $importer->is_imported($hal_id); + $author_hal_ids = $doc['authIdHal_s'] ?? []; + $matched_users = $this->match_authors_to_users($author_hal_ids); + $has_match = !empty($matched_users); + + // Update stats + if ($is_imported) { + $preview['stats']['imported']++; + } elseif ($has_match) { + $preview['stats']['ready']++; + } else { + $preview['stats']['blocked']++; + } + + $preview['docs'][] = [ + 'hal_id' => $hal_id, + 'title' => $doc['title_s'][0] ?? 'N/A', + 'type' => $doc['docType_s'] ?? '', + 'authors' => $doc['authFullName_s'] ?? [], + 'author_hal_ids' => $author_hal_ids, + 'publication_date' => $doc['publicationDate_s'] ?? '', + 'produced_date' => $doc['submittedDate_s'] ?? '', + 'journal' => $doc['journalTitle_s'] ?? $doc['bookTitle_s'] ?? '', + 'url' => $doc['uri_s'] ?? '', + 'is_imported' => $is_imported, + 'matched_users' => $matched_users, + 'has_match' => $has_match, + ]; + $preview['raw_docs'][] = $doc; // Full HAL doc kept for import + } + + set_transient($cache_key, $preview, 300); + return $preview; + } + + /** + * Load all WordPress users with HAL IDs into cache. 
+ * Stores: normalized_hal_id => ['id' => int, 'name' => string] + */ + private function load_wp_users_hal_ids() { + if ($this->wp_users_by_hal_id !== null) return; + + $this->wp_users_by_hal_id = []; + $users = get_users([ + 'meta_key' => 'identifiant_hal', + 'meta_compare' => 'EXISTS' + ]); + + foreach ($users as $user) { + $hal_id = get_user_meta($user->ID, 'identifiant_hal', true); + if (!empty($hal_id)) { + $normalized = strtolower(trim($hal_id)); + $this->wp_users_by_hal_id[$normalized] = [ + 'id' => $user->ID, + 'name' => $user->display_name, + 'hal_id' => trim($hal_id), // original value for API filter + ]; + } + } + } + + /** + * Match HAL author IDs to WordPress users. + * Returns array of display names (for preview display). + */ + private function match_authors_to_users($author_hal_ids) { + $matched = []; + foreach ($author_hal_ids as $hal_id) { + $normalized = strtolower(trim($hal_id)); + if (isset($this->wp_users_by_hal_id[$normalized])) { + $matched[] = $this->wp_users_by_hal_id[$normalized]['name']; + } + } + return $matched; + } + + // ======================================================================== + // CSV bulk import (phase 2 — legacy publications from SPIP) + // ======================================================================== + + private const CSV_QUEUE_OPTION = 'thalim_hal_csv_queue'; + private const CSV_BATCH_SIZE = 100; + + private function render_csv_import() { + $queue = get_option(self::CSV_QUEUE_OPTION, null); + ?> +
+

Import en masse depuis CSV

+

+ Uploader le couple hal-to-import.csv + hal-to-import-context.json + (généré par php scripts/prepare-csv-context.php) pour importer les publications legacy. + Chaque batch traite publications — cliquer plusieurs fois jusqu'à terminaison. +

+ + +
+ + + + + + + + + + + + + + + + + + + +
Fichier CSV
Fichier contexte (JSON) +

Généré par scripts/prepare-csv-context.php.

Statut des posts + + +
Date du post WP + +
+

+
+ + render_csv_progress($queue); ?> + +
+ 0 ? round(100 * $done / $total, 1) : 0; + $report_ct = count($queue['report'] ?? []); + ?> +
+

File d'attente active — statut cible : + — backdate :

+

+ / + publications traitées (%) + — restantes +

+ +

Dernière mise à jour :

+ + + +

+ Erreur dernier batch : +

+ + +
+ + + +
+ + 0): ?> +
+ + + +
+ + +
+ + + +
+
+ message = ['error', 'CSV ou fichier contexte manquant.']; + return; + } + + // Parse CSV -> list of hal_ids + $fh = fopen($_FILES['csv_file']['tmp_name'], 'r'); + if (!$fh) { $this->message = ['error', 'Impossible de lire le CSV.']; return; } + $header = fgetcsv($fh); + $hal_col = array_search('hal_id', $header); + $spip_col = array_search('spip_id', $header); + if ($hal_col === false) { + fclose($fh); + $this->message = ['error', 'Header CSV : colonne hal_id manquante.']; + return; + } + $hal_ids = []; + $spip_map = []; // hal_id => spip_id + while (($row = fgetcsv($fh)) !== false) { + $hid = trim($row[$hal_col] ?? ''); + if ($hid === '') continue; + $hal_ids[] = $hid; + if ($spip_col !== false) $spip_map[$hid] = trim($row[$spip_col] ?? ''); + } + fclose($fh); + $hal_ids = array_values(array_unique($hal_ids)); + + // Parse JSON context + $ctx_raw = file_get_contents($_FILES['ctx_file']['tmp_name']); + $ctx_data = json_decode($ctx_raw, true); + if (!is_array($ctx_data) || !isset($ctx_data['ctx'])) { + $this->message = ['error', 'Fichier contexte JSON invalide.']; + return; + } + + $status = ($_POST['post_status'] ?? 'publish') === 'pending' ? 'pending' : 'publish'; + $backdate = !empty($_POST['backdate_post']); + + $queue = [ + 'hal_ids' => $hal_ids, + 'spip_map' => $spip_map, + 'status' => $status, + 'backdate' => $backdate, + 'total' => count($hal_ids), + 'done' => 0, + 'spip_ctx' => $ctx_data['ctx'], + 'wp_users_by_hal_id' => $ctx_data['wp_users_by_hal_id'] ?? [], + 'report' => [], + 'last_error' => '', + 'updated_at' => current_time('mysql'), + ]; + update_option(self::CSV_QUEUE_OPTION, $queue, false); + $this->message = ['success', sprintf( + 'CSV chargé : %d publications prêtes. Statut cible : %s. 
Cliquer "Traiter le prochain batch" pour lancer.', + count($hal_ids), $status + )]; + } + + private function handle_csv_batch(): void { + $queue = get_option(self::CSV_QUEUE_OPTION, null); + if (!$queue) { $this->message = ['error', 'Aucune queue active.']; return; } + + $batch = array_slice($queue['hal_ids'], $queue['done'], self::CSV_BATCH_SIZE); + if (empty($batch)) { + $this->message = ['success', 'Import terminé — tous les batches ont été traités.']; + return; + } + + $docs = $this->api->fetch_by_hal_ids($batch, self::CSV_BATCH_SIZE); + if (is_wp_error($docs)) { + $queue['last_error'] = $docs->get_error_message(); + $queue['updated_at'] = current_time('mysql'); + update_option(self::CSV_QUEUE_OPTION, $queue, false); + $this->message = ['error', 'Erreur HAL API : ' . $docs->get_error_message()]; + return; + } + + // Normalize wp_users_by_hal_id keys to lowercase for the importer + $users_map = []; + foreach ($queue['wp_users_by_hal_id'] as $hid => $u) { + $users_map[strtolower(trim((string) $hid))] = $u; + } + + $importer = new Thalim_HAL_Importer_Logic(); + $batch_imported = 0; + $batch_skipped = 0; + $batch_errors = 0; + + foreach ($batch as $hal_id) { + $spip_id = $queue['spip_map'][$hal_id] ?? ''; + $doc = $docs[$hal_id] ?? null; + $ctx = $queue['spip_ctx'][$hal_id] ?? []; + + if (!$doc) { + $queue['report'][] = [$hal_id, $spip_id, '', 'not_found_in_hal', 'false', 'none', 'HAL API did not return this hal_id']; + $batch_errors++; + continue; + } + + $post_id = $importer->import($doc, $users_map, $queue['status'], (bool) $queue['backdate'], $ctx); + if (is_wp_error($post_id)) { + $code = $post_id->get_error_code(); + $queue['report'][] = [$hal_id, $spip_id, '', $code, 'false', 'none', $post_id->get_error_message()]; + if ($code === 'exists') $batch_skipped++; + else $batch_errors++; + } else { + $source = $importer->last_axes_source; + $has_axe = $source !== 'none' ? 
'true' : 'false'; + $queue['report'][] = [$hal_id, $spip_id, (string) $post_id, 'imported', $has_axe, $source, '']; + $batch_imported++; + } + } + + $queue['done'] += count($batch); + $queue['last_error'] = ''; + $queue['updated_at'] = current_time('mysql'); + update_option(self::CSV_QUEUE_OPTION, $queue, false); + + $this->message = ['success', sprintf( + 'Batch traité : %d importé(s), %d déjà importé(s), %d erreur(s). Progression : %d / %d.', + $batch_imported, $batch_skipped, $batch_errors, + $queue['done'], $queue['total'] + )]; + } + + private function handle_csv_cancel(): void { + delete_option(self::CSV_QUEUE_OPTION); + $this->message = ['success', 'Queue CSV annulée.']; + } + + private function handle_csv_download_report(): void { + $queue = get_option(self::CSV_QUEUE_OPTION, null); + if (!$queue || empty($queue['report'])) { + $this->message = ['warning', 'Aucun rapport à télécharger.']; + return; + } + $filename = 'hal-import-report-' . date('Ymd-His') . '.csv'; + header('Content-Type: text/csv; charset=utf-8'); + header('Content-Disposition: attachment; filename="' . $filename . 
'"'); + $out = fopen('php://output', 'w'); + fputcsv($out, ['hal_id', 'spip_id', 'post_id', 'status', 'has_axe', 'axes_source', 'error']); + foreach ($queue['report'] as $row) fputcsv($out, $row); + fclose($out); + exit; + } + + // ======================================================================== + // End CSV bulk import + // ======================================================================== + + private function get_row_class($doc) { + if ($doc['is_imported']) return 'hal-status-imported'; + if ($doc['has_match']) return 'hal-status-ready'; + return 'hal-status-blocked'; + } + + private function get_status_icon($doc) { + if ($doc['is_imported']) return ''; + if ($doc['has_match']) return ''; + return ''; + } +} diff --git a/includes/class-hal-api.php b/includes/class-hal-api.php new file mode 100644 index 0000000..026092c --- /dev/null +++ b/includes/class-hal-api.php @@ -0,0 +1,137 @@ +build_url($rows, $start, $sort, $date_from, $date_to, $author_hal_id); + return $this->request($url); + } + + /** + * Fetch full HAL docs by a list of hal_ids (batched). + * Uses Solr fq=halId_s:(id1 OR id2 OR ...) syntax. No structId filter — + * fetch by halId exact, regardless of structure. + * + * @param string[] $hal_ids HAL IDs to fetch. + * @param int $batch Batch size (default 100). + * @return array|WP_Error Array keyed by halId_s, or WP_Error on failure. + */ + public function fetch_by_hal_ids(array $hal_ids, int $batch = 100) { + $docs = []; + $chunks = array_chunk(array_values(array_unique($hal_ids)), $batch); + foreach ($chunks as $chunk) { + $filter = 'halId_s:(' . implode(' OR ', $chunk) . ')'; + $params = [ + 'q=' . urlencode('*:*'), + 'fq=' . urlencode($filter), + 'rows=' . count($chunk), + 'fl=' . urlencode(self::FIELDS), + 'wt=json', + ]; + $url = THALIM_HAL_API_BASE . '?' . implode('&', $params); + $data = $this->request($url); + if (is_wp_error($data)) return $data; + foreach ($data['response']['docs'] ?? 
[] as $doc) { + if (!empty($doc['halId_s'])) { + $docs[$doc['halId_s']] = $doc; + } + } + // Be polite with HAL if we have multiple chunks + if (count($chunks) > 1) usleep(250000); + } + return $docs; + } + + /** + * Test API connection + */ + public function test_connection() { + $result = $this->fetch_publications(5); + if (is_wp_error($result)) return $result; + return [ + 'success' => true, + 'total' => $result['response']['numFound'] ?? 0, + 'sample' => $result['response']['docs'] ?? [] + ]; + } + + /** + * Build API URL with proper fq parameter handling + * + * @param int $rows + * @param int $start + * @param string $sort + * @param string $date_from YYYY-MM-DD or empty + * @param string $date_to YYYY-MM-DD or empty + */ + private function build_url($rows = 5, $start = 0, $sort = 'modifiedDate_tdate desc', $date_from = '', $date_to = '', $author_hal_id = '') { + $doc_types = implode(' OR ', THALIM_HAL_DOC_TYPES); + + $from = $date_from ? $date_from . 'T00:00:00Z' : '*'; + $to = $date_to ? $date_to . 'T23:59:59Z' : '*'; + + $params = [ + 'q=' . urlencode('*:*'), + 'fq=' . urlencode('structId_i:' . THALIM_HAL_STRUCT_ID), + 'fq=' . urlencode('docType_s:(' . $doc_types . ')'), + ]; + + if ($from !== '*' || $to !== '*') { + $params[] = 'fq=' . urlencode('producedDate_tdate:[' . $from . ' TO ' . $to . ']'); + } + + if ($author_hal_id !== '') { + $params[] = 'fq=' . urlencode('authIdHal_s:' . $author_hal_id); + } + + $params = array_merge($params, [ + 'rows=' . intval($rows), + 'start=' . intval($start), + 'sort=' . urlencode($sort), + 'fl=' . urlencode(self::FIELDS), + 'wt=json' + ]); + + return THALIM_HAL_API_BASE . '?' . 
implode('&', $params); + } + + /** + * Get API URL for debugging display + */ + public function get_api_url($rows = 5) { + return $this->build_url($rows, 0); + } + + /** + * Make HTTP request + */ + private function request($url) { + $response = wp_remote_get($url, ['timeout' => 30, 'headers' => ['Accept' => 'application/json']]); + if (is_wp_error($response)) return $response; + + $code = wp_remote_retrieve_response_code($response); + if ($code !== 200) return new WP_Error('api_error', "HTTP $code"); + + $data = json_decode(wp_remote_retrieve_body($response), true); + return json_last_error() === JSON_ERROR_NONE ? $data : new WP_Error('json_error', 'Invalid JSON'); + } +} diff --git a/includes/class-importer.php b/includes/class-importer.php new file mode 100644 index 0000000..76ba277 --- /dev/null +++ b/includes/class-importer.php @@ -0,0 +1,523 @@ + WordPress category ID + private const DOC_TYPE_MAP = [ + 'ART' => 16, // Article + 'COUV' => 16, // Chapitre -> Articles + 'OUV' => 15, // Ouvrage -> Ouvrages + 'COMM' => 13, // Communication -> Communications + 'ISSUE' => 16, // Direction de numéro -> Articles + 'PROCEEDINGS' => 15, // Direction d'ouvrage/Proceedings -> Ouvrages + 'THESE' => 14, // Thèse -> Soutenances + 'HDR' => 14, // HDR -> Soutenances + 'SON' => 19, // Son -> Captations audio/vidéo + 'VIDEO' => 19, // Vidéo -> Captations audio/vidéo + ]; + + // Doc types that use date_de_debut instead of datetime + private const EVENT_DOC_TYPES = ['COMM', 'THESE', 'HDR', 'SON', 'VIDEO']; + + // Pods IDs — queried from the DB, stable per installation + private const POD_ID_POST = 8; + private const FIELD_ID_CATEGORIE = 16; // "Type d'annonce" (picks from WP category) + private const FIELD_ID_MEMBRES = 178; + private const FIELD_ID_AUTRE_MBRES = 195; // autre_membres (unused in import, for reference) + private const FIELD_ID_AXES = 270; // axes_thematiques (picks from axe_thematique) + private const FIELD_ID_PROGRAMMES = 271; // programmes_de_recherche (picks from 
programme_de_recherche) + private const FIELD_ID_ETIQUETTES = 652; // étiquettes (picks from post_tag) + + /** Source of the axes applied on the last import(): 'spip' | 'coauthors' | 'owner' | 'none'. */ + public $last_axes_source = 'none'; + + /** + * Check if publication is already imported + */ + public function is_imported($hal_id) { + if (empty($hal_id)) return false; + global $wpdb; + return (int) $wpdb->get_var($wpdb->prepare( + "SELECT COUNT(*) FROM {$wpdb->postmeta} WHERE meta_key = 'hal_id' AND meta_value = %s", + $hal_id + )) > 0; + } + + /** + * Get category ID for HAL doc type + */ + public function get_category_id($doc_type) { + return self::DOC_TYPE_MAP[$doc_type] ?? null; + } + + /** + * Get doc type mappings + */ + public function get_doc_type_map() { + return self::DOC_TYPE_MAP; + } + + /** + * Import a HAL publication as a WordPress post. + * + * @param array $hal_doc Raw HAL API document. + * @param array $wp_users_by_hal_id Map of normalized_hal_id => ['id' => int, 'name' => string]. + * @param string $post_status Target post_status (default 'pending'). + * @param bool $backdate_post Use producedDate_s as post_date (default false). + * @param array $spip_context SPIP-derived context for bulk imports: + * ['axes' => int[], 'tags' => int[], 'programmes' => int[], 'owner_user_id' => ?int] + * @return int|WP_Error New post ID on success, WP_Error on failure. + * The axes source is stored in $this->last_axes_source for caller reporting. + */ + public function import( + array $hal_doc, + array $wp_users_by_hal_id = [], + string $post_status = 'pending', + bool $backdate_post = false, + array $spip_context = [] + ) { + $hal_id = $hal_doc['halId_s'] ?? ''; + $doc_type = $hal_doc['docType_s'] ?? ''; + + if (empty($hal_id)) return new WP_Error('no_id', 'Missing HAL ID'); + if ($this->is_imported($hal_id)) return new WP_Error('exists', 'Already imported: ' . 
$hal_id); + + // --- Resolve post author from HAL author IDs --- + $author_hal_ids = $hal_doc['authIdHal_s'] ?? []; + $author_names = $hal_doc['authFullName_s'] ?? []; + $matched_user_ids = []; + $matched_user_names = []; + foreach ($author_hal_ids as $hal_author_id) { + $normalized = strtolower(trim($hal_author_id)); + if (isset($wp_users_by_hal_id[$normalized])) { + $user = $wp_users_by_hal_id[$normalized]; + $matched_user_ids[] = $user['id']; + $matched_user_names[] = $user['name']; + } + } + $post_author = !empty($matched_user_ids) ? $matched_user_ids[0] : 1; + + // --- Create the post --- + $post_args = [ + 'post_title' => wp_strip_all_tags($hal_doc['title_s'][0] ?? ''), + 'post_content' => wp_kses_post($hal_doc['abstract_s'][0] ?? ''), + 'post_status' => $post_status, + 'post_type' => 'post', + 'post_author' => $post_author, + ]; + + // Backdate post_date to HAL producedDate_s when requested (for legacy bulk imports) + if ($backdate_post) { + $backdate_ymd = $this->parse_hal_date($hal_doc['producedDate_s'] ?? ''); + if ($backdate_ymd) { + $post_args['post_date'] = $backdate_ymd . ' 12:00:00'; + $post_args['post_date_gmt'] = $backdate_ymd . ' 12:00:00'; + } + } + + $post_id = wp_insert_post($post_args, true); + if (is_wp_error($post_id)) return $post_id; + + // --- Category — Pods triple-storage pattern --- + $cat_id = self::DOC_TYPE_MAP[$doc_type] ?? null; + if ($cat_id) { + global $wpdb; + + // 1. Native WP category assignment + wp_set_post_categories($post_id, [$cat_id]); + + // 2. Pods postmeta: single integer value + update_post_meta($post_id, 'categorie', $cat_id); + + // 3. Pods _pods_ meta: serialized array of one integer + update_post_meta($post_id, '_pods_categorie', [$cat_id]); + + // 4. wp_podsrel row + $wpdb->insert( + $wpdb->prefix . 
'podsrel', + [ + 'pod_id' => self::POD_ID_POST, + 'field_id' => self::FIELD_ID_CATEGORIE, + 'item_id' => $post_id, + 'related_pod_id' => 0, + 'related_field_id'=> 0, + 'related_item_id' => $cat_id, + 'weight' => 0, + ], + ['%d', '%d', '%d', '%d', '%d', '%d', '%d'] + ); + } + + // --- Core meta --- + update_post_meta($post_id, 'hal_id', $hal_id); + update_post_meta($post_id, 'hal_url', $hal_doc['uri_s'] ?? ''); + + // HAL PDF file -> lien_externe_1 + $file_url = $hal_doc['fileMain_s'] ?? ''; + if ($file_url) { + update_post_meta($post_id, 'lien_externe_1', $file_url); + update_post_meta($post_id, 'titre_du_lien_externe_1', 'Document HAL // HAL Document'); + } + + // Journal (ART) + $journal = $hal_doc['journalTitle_s'] ?? ''; + if ($journal) { + update_post_meta($post_id, 'journal', $journal); + } + + // Book title as sous-titre (COUV), only if different from post title + $book_title = $hal_doc['bookTitle_s'] ?? ''; + $post_title = $hal_doc['title_s'][0] ?? ''; + if ($book_title && $book_title !== $post_title) { + update_post_meta($post_id, 'sous-titre', $book_title); + } + + // Publisher -> editeur (plain text, no Pods triple-storage needed) + $publisher = $hal_doc['publisher_s'] ?? ''; + if (is_array($publisher)) $publisher = $publisher[0] ?? ''; + if ($publisher) { + update_post_meta($post_id, 'editeur', $publisher); + } + + // Fonction label: bilingual plain text (only relevant for cats 4, 15, 16) + if ($doc_type === 'COUV') { + update_post_meta($post_id, 'fonction_auteur', 'Auteur du chapitre // Chapter author'); + } elseif ($doc_type === 'ISSUE') { + update_post_meta($post_id, 'fonction_auteur', 'Direction de numéro // Editor-in-Chief'); + } + + // --- Keywords -> étiquettes (Pods triple-storage, picks from post_tag) --- + $hal_keywords = $hal_doc['keyword_s'] ?? []; + if (!empty($hal_keywords)) { + $matched_term_ids = $this->match_keywords_to_tags($hal_keywords); + if (!empty($matched_term_ids)) { + global $wpdb; + + // 1. 
Native WP term relationship + wp_set_object_terms($post_id, $matched_term_ids, 'post_tag', true); + + // 2. Individual postmeta rows (one per term ID) + foreach ($matched_term_ids as $tid) { + add_post_meta($post_id, 'etiquettes', (string) $tid); + } + + // 3. _pods_etiquettes: serialized array of term IDs as integers + update_post_meta($post_id, '_pods_etiquettes', array_map('intval', $matched_term_ids)); + + // 4. wp_podsrel rows + foreach ($matched_term_ids as $weight => $tid) { + $wpdb->insert( + $wpdb->prefix . 'podsrel', + [ + 'pod_id' => self::POD_ID_POST, + 'field_id' => self::FIELD_ID_ETIQUETTES, + 'item_id' => $post_id, + 'related_pod_id' => 0, + 'related_field_id' => 0, + 'related_item_id' => (int) $tid, + 'weight' => $weight, + ], + ['%d', '%d', '%d', '%d', '%d', '%d', '%d'] + ); + } + } + } + + // --- Date meta --- + $date_raw = $hal_doc['producedDate_s'] ?? ''; + // THESE/HDR: use defenseDate_s if available, fallback to producedDate_s + if (in_array($doc_type, ['THESE', 'HDR'])) { + $defense = $hal_doc['defenseDate_s'] ?? ''; + if ($defense) $date_raw = $defense; + } + $date_meta = $this->parse_hal_date($date_raw); + if ($date_meta) { + $date_field = in_array($doc_type, self::EVENT_DOC_TYPES) ? 'date_de_debut' : 'datetime'; + update_post_meta($post_id, $date_field, $date_meta); + } + + // --- Type pick fields (pick custom-simple — no triple-storage) --- + $type_picks = [ + 'PROCEEDINGS' => ['type_colloque', 'Colloque'], + 'THESE' => ['type_soutenance', 'Soutenance de thèse'], + 'HDR' => ['type_soutenance', "Soutenance d'habilitation"], + 'SON' => ['type_captation', 'Son'], + 'VIDEO' => ['type_captation', 'Vidéo'], + ]; + if (isset($type_picks[$doc_type])) { + [$field, $value] = $type_picks[$doc_type]; + update_post_meta($post_id, $field, $value); + } + + // --- Lieu for PROCEEDINGS (city, country from HAL) --- + if ($doc_type === 'PROCEEDINGS') { + $city = $hal_doc['city_s'] ?? ''; + $country = $hal_doc['country_s'] ?? 
''; + if (is_array($city)) $city = $city[0] ?? ''; + if (is_array($country)) $country = $country[0] ?? ''; + $lieu = trim("$city, $country", ', '); + if ($lieu) { + update_post_meta($post_id, 'lieu', $lieu); + } + } + + // --- Conference title as sous-titre for PROCEEDINGS --- + if ($doc_type === 'PROCEEDINGS') { + $conf_title = $hal_doc['conferenceTitle_s'] ?? ''; + if ($conf_title) { + update_post_meta($post_id, 'sous-titre', $conf_title); + } + } + + // --- Reference bibliographique from citationFull_s (cats 4, 15, 16) --- + $citation = $hal_doc['citationFull_s'] ?? ''; + if ($citation && in_array($cat_id, [4, 15, 16])) { + update_post_meta($post_id, 'reference_bibliographique', wp_kses_post($citation)); + } + + // --- Store matched THALIM members — Pods triple-storage pattern + if (!empty($matched_user_ids)) { + global $wpdb; + + // 1. Individual postmeta rows (one per user ID, as string) + foreach ($matched_user_ids as $uid) { + add_post_meta($post_id, 'membres', (string) $uid); + } + + // 2. _pods_ meta: serialized PHP array of user IDs as integers + update_post_meta($post_id, '_pods_membres', array_map('intval', $matched_user_ids)); + + // 3. wp_podsrel rows (one per user, weight = position) + foreach ($matched_user_ids as $weight => $uid) { + $wpdb->insert( + $wpdb->prefix . 
'podsrel', + [ + 'pod_id' => self::POD_ID_POST, + 'field_id' => self::FIELD_ID_MEMBRES, + 'item_id' => $post_id, + 'related_pod_id' => 0, + 'related_field_id'=> 0, + 'related_item_id' => (int) $uid, + 'weight' => $weight, + ], + ['%d', '%d', '%d', '%d', '%d', '%d', '%d'] + ); + } + } + + // --- Axes thématiques : cascade (SPIP direct > co-auteurs > owner) --- + $axes_resolution = $this->resolve_axes_cascade($matched_user_ids, $spip_context); + $this->last_axes_source = $axes_resolution['source']; + if (!empty($axes_resolution['term_ids'])) { + $this->set_pods_taxonomy_multi( + $post_id, 'axes_thematiques', self::FIELD_ID_AXES, + $axes_resolution['term_ids'], 'axe_thematique' + ); + } + + // --- Programmes de recherche : SPIP direct OR keyword matching --- + $prog_ids = !empty($spip_context['programmes']) + ? array_map('intval', $spip_context['programmes']) + : $this->match_terms_by_keywords($hal_doc['keyword_s'] ?? [], 'programme_de_recherche'); + if (!empty($prog_ids)) { + $this->set_pods_taxonomy_multi( + $post_id, 'programmes_de_recherche', self::FIELD_ID_PROGRAMMES, + $prog_ids, 'programme_de_recherche' + ); + } + + // --- Étiquettes SPIP directes (en plus du matching HAL déjà fait plus haut) --- + if (!empty($spip_context['tags'])) { + // Merge avec les tags déjà posés par le bloc étiquettes plus haut + $existing = wp_get_object_terms($post_id, 'post_tag', ['fields' => 'ids']); + $merged = array_values(array_unique(array_merge( + is_array($existing) ? array_map('intval', $existing) : [], + array_map('intval', $spip_context['tags']) + ))); + $this->set_pods_taxonomy_multi( + $post_id, 'etiquettes', self::FIELD_ID_ETIQUETTES, + array_diff($merged, is_array($existing) ? 
/**
 * Match HAL keyword strings against existing WordPress terms in a given taxonomy.
 *
 * Term names may be bilingual, written "Terme FR // English term". Each name is
 * split on the first ' // ' and every part is compared, trimmed and
 * case-insensitively, with the HAL keywords.
 *
 * Keywords that are not scalar, or that are empty once trimmed, are ignored so
 * that a malformed keyword_s payload cannot raise a TypeError and a blank
 * keyword cannot match a term whose FR or EN part is blank.
 *
 * @param array  $hal_keywords Raw keyword values from the HAL keyword_s field.
 * @param string $taxonomy     WordPress taxonomy to search (e.g. 'post_tag', 'programme_de_recherche').
 * @return int[] IDs of the matching terms, in the order get_terms() returned them.
 */
private function match_terms_by_keywords(array $hal_keywords, string $taxonomy = 'post_tag'): array {
    // Normalise the keywords once into a lookup set (keyword => true).
    $wanted = [];
    foreach ($hal_keywords as $keyword) {
        if (!is_scalar($keyword)) {
            continue;
        }
        $keyword = mb_strtolower(trim((string) $keyword));
        if ($keyword !== '') {
            $wanted[$keyword] = true;
        }
    }

    // Nothing usable: skip the term query altogether.
    if (empty($wanted)) {
        return [];
    }

    $terms = get_terms(['taxonomy' => $taxonomy, 'hide_empty' => false]);
    if (is_wp_error($terms) || empty($terms)) {
        return [];
    }

    $matched = [];
    foreach ($terms as $term) {
        // At most two parts: the FR label and, when present, the EN label.
        foreach (explode(' // ', $term->name, 2) as $label) {
            $label = mb_strtolower(trim($label));
            if ($label !== '' && isset($wanted[$label])) {
                $matched[] = (int) $term->term_id;
                break; // one hit is enough, never list a term twice
            }
        }
    }

    return $matched;
}
/**
 * Backwards-compatible alias for the renamed method.
 *
 * Delegates to match_terms_by_keywords() with the 'post_tag' taxonomy.
 *
 * @param array $hal_keywords Raw keyword strings from the HAL keyword_s field.
 * @return int[] Matched post_tag term IDs.
 */
private function match_keywords_to_tags(array $hal_keywords): array {
    return $this->match_terms_by_keywords($hal_keywords, 'post_tag');
}

/**
 * Resolve axes thématiques through a cascade of strategies; the first one
 * that yields at least one usable term ID wins.
 *
 * 1. Direct SPIP links ($spip_context['axes'])
 * 2. Axes of all matched WP co-authors (user meta _pods_axes_thematiques)
 * 3. Axes of the SPIP flux owner user ($spip_context['owner_user_id'])
 *
 * At every step IDs are cast to int, zero/non-scalar entries are dropped and
 * duplicates removed, so a non-numeric SPIP value can no longer produce a
 * bogus term ID 0.
 *
 * @param array $matched_user_ids WordPress user IDs of the matched authors.
 * @param array $spip_context     Optional keys read here: 'axes', 'owner_user_id'.
 * @return array{source: string, term_ids: int[]} source is 'spip', 'coauthors', 'owner' or 'none'.
 */
private function resolve_axes_cascade(array $matched_user_ids, array $spip_context): array {
    // Cast to int, drop non-scalar and zero entries, de-duplicate, reindex.
    $clean_ids = static function ($raw): array {
        if (!is_array($raw)) {
            return [];
        }
        $ids = array_map('intval', array_filter($raw, 'is_scalar'));
        return array_values(array_unique(array_filter($ids)));
    };

    // Read one user's axes from the Pods meta. The value may come back as an
    // array or as a still-serialized string; objects are never instantiated.
    $axes_of_user = static function (int $user_id) use ($clean_ids): array {
        $axes = get_user_meta($user_id, '_pods_axes_thematiques', true);
        if (is_string($axes) && $axes !== '') {
            // @ kept on purpose: a malformed string must not surface a notice.
            $axes = @unserialize($axes, ['allowed_classes' => false]);
        }
        return $clean_ids($axes);
    };

    // 1. SPIP direct
    if (!empty($spip_context['axes'])) {
        $ids = $clean_ids((array) $spip_context['axes']);
        if (!empty($ids)) {
            return ['source' => 'spip', 'term_ids' => $ids];
        }
    }

    // 2. Matched co-authors (any matched THALIM member with an axe)
    $from_authors = [];
    foreach ($matched_user_ids as $uid) {
        foreach ($axes_of_user((int) $uid) as $tid) {
            $from_authors[] = $tid;
        }
    }
    $from_authors = array_values(array_unique($from_authors));
    if (!empty($from_authors)) {
        return ['source' => 'coauthors', 'term_ids' => $from_authors];
    }

    // 3. SPIP flux owner user
    if (!empty($spip_context['owner_user_id'])) {
        $ids = $axes_of_user((int) $spip_context['owner_user_id']);
        if (!empty($ids)) {
            return ['source' => 'owner', 'term_ids' => $ids];
        }
    }

    return ['source' => 'none', 'term_ids' => []];
}

/**
 * Parse a HAL date (YYYY, YYYY-MM, YYYY-MM-DD, or ISO datetime) to Y-m-d.
 *
 * Partial dates are completed with "-01": HAL often emits them and strtotime
 * mishandles them (e.g. strtotime("2022") reads 2022 as a time, not a year).
 * For a YYYY-MM-DD prefix, whatever follows the day (time, zone) is ignored.
 * Calendar-impossible values such as "2022-13" or "2022-02-30" are rejected.
 *
 * @param string $raw Date string as received from HAL.
 * @return string Date formatted Y-m-d, or '' when the input is empty, invalid or unparseable.
 */
private function parse_hal_date(string $raw): string {
    $raw = trim($raw);
    if ($raw === '') {
        return '';
    }

    if (preg_match('/^(\d{4})-(\d{2})-(\d{2})/', $raw, $m)) {
        return checkdate((int) $m[2], (int) $m[3], (int) $m[1]) ? "{$m[1]}-{$m[2]}-{$m[3]}" : '';
    }
    if (preg_match('/^(\d{4})-(\d{2})$/', $raw, $m)) {
        return checkdate((int) $m[2], 1, (int) $m[1]) ? "{$m[1]}-{$m[2]}-01" : '';
    }
    if (preg_match('/^(\d{4})$/', $raw, $m)) {
        return checkdate(1, 1, (int) $m[1]) ? "{$m[1]}-01-01" : '';
    }

    // Last resort for any other format strtotime understands.
    $ts = strtotime($raw);
    return $ts !== false ? date('Y-m-d', $ts) : '';
}
/**
 * Generic Pods triple-storage writer for multi-value taxonomy fields.
 * Writes to: wp_term_relationships, postmeta rows, _pods_ meta, wp_podsrel.
 *
 * The write is additive. Terms are appended to the native relationship, and
 * only IDs not yet listed in the "_pods_<field>" meta get a postmeta row and
 * a podsrel row. The "_pods_<field>" meta is extended rather than overwritten,
 * and podsrel weights continue after the entries already stored. A second
 * call for the same field therefore does not lose the IDs written by the first.
 *
 * NOTE(review): assumes the "_pods_<field>" meta mirrors the postmeta and
 * podsrel rows written earlier for this post — confirm against other writers.
 *
 * @param int    $post_id    Post receiving the terms.
 * @param string $field_name Pods field name (also the postmeta key).
 * @param int    $field_id   Pods field ID stored in podsrel.field_id.
 * @param array  $term_ids   Term IDs to attach; cast to int, zero and duplicate values dropped.
 * @param string $taxonomy   Taxonomy passed to wp_set_object_terms().
 */
private function set_pods_taxonomy_multi(int $post_id, string $field_name, int $field_id, array $term_ids, string $taxonomy): void {
    global $wpdb;

    // Cast to int, drop non-scalar and zero entries, de-duplicate, reindex.
    $to_ids = static function ($raw): array {
        if (!is_array($raw)) {
            return [];
        }
        $ids = array_map('intval', array_filter($raw, 'is_scalar'));
        return array_values(array_unique(array_filter($ids)));
    };

    $term_ids = $to_ids($term_ids);
    if (empty($term_ids)) {
        return;
    }

    // 1. wp_term_relationships (append mode: existing terms are kept)
    wp_set_object_terms($post_id, $term_ids, $taxonomy, true);

    // IDs already recorded for this field must not be written a second time.
    $pods_key = '_pods_' . $field_name;
    $stored   = $to_ids(get_post_meta($post_id, $pods_key, true));
    $new_ids  = array_values(array_diff($term_ids, $stored));
    if (empty($new_ids)) {
        return;
    }

    // 2. postmeta (one row per term ID, as string)
    foreach ($new_ids as $tid) {
        add_post_meta($post_id, $field_name, (string) $tid);
    }

    // 3. _pods_ meta: array of ints, previously stored IDs first
    update_post_meta($post_id, $pods_key, array_merge($stored, $new_ids));

    // 4. wp_podsrel rows (weight = position in the combined list)
    $offset = count($stored);
    foreach ($new_ids as $position => $tid) {
        $wpdb->insert(
            $wpdb->prefix . 'podsrel',
            [
                'pod_id'           => self::POD_ID_POST,
                'field_id'         => $field_id,
                'item_id'          => $post_id,
                'related_pod_id'   => 0,
                'related_field_id' => 0,
                'related_item_id'  => (int) $tid,
                'weight'           => $offset + $position,
            ],
            ['%d', '%d', '%d', '%d', '%d', '%d', '%d']
        );
    }
}