Files
thalim-plugin-hal-importer/includes/class-importer.php

425 lines
18 KiB
PHP

<?php
/**
* Importer Logic Class - Handles import from HAL to WordPress
*/
// Stop execution when the ABSPATH constant is not defined
// (presumably the file was requested directly instead of being loaded by WordPress).
if (!defined('ABSPATH')) {
exit;
}
class Thalim_HAL_Importer_Logic {
// HAL doc type -> WP category slug (resolved to a term_id at runtime —
// auto-incremented IDs do not survive a database re-import)
private const DOC_TYPE_SLUGS = [
'ART' => 'articles', // Article
'COUV' => 'articles', // Book chapter -> Articles
'OUV' => 'ouvrages', // Book -> Ouvrages
'COMM' => 'communications', // Conference paper -> Communications
'ISSUE' => 'articles', // Journal issue editorship -> Articles
'PROCEEDINGS' => 'ouvrages', // Edited volume/Proceedings -> Ouvrages
'THESE' => 'soutenances', // Thesis -> Soutenances
'HDR' => 'soutenances', // HDR -> Soutenances
'SON' => 'medias', // Audio -> Médias
'VIDEO' => 'medias', // Video -> Médias
'NOTICE' => 'articles', // Notice/review -> Articles
'BLOG' => 'medias', // Blog/op-ed -> Médias
'TRAD' => 'ouvrages', // Translation -> Ouvrages (author function "Traduction")
'REPORT' => 'publications-et-productions', // Report -> Publications et productions
'UNDEFINED' => 'publications-et-productions', // Undefined -> Publications et productions
'POSTER' => 'publications-et-productions', // Poster -> Publications et productions
'OTHER' => 'publications-et-productions', // Other -> Publications et productions
];
// Doc types whose date is stored in the date_de_debut meta instead of datetime
private const EVENT_DOC_TYPES = ['COMM', 'THESE', 'HDR', 'SON', 'VIDEO'];
/**
 * Look up the term_id of the category that carries the given slug.
 *
 * Every answer, including a miss, is memoised in a function-level static array,
 * so get_term_by() runs at most once per slug.
 *
 * @param string $slug Category slug.
 * @return int|null Term ID, or null when get_term_by() finds no matching category.
 */
private function cat_id_by_slug(string $slug): ?int {
    static $resolved_ids = [];

    // array_key_exists (not isset) so that a memoised null miss is also served from the cache.
    if (array_key_exists($slug, $resolved_ids)) {
        return $resolved_ids[$slug];
    }

    $category = get_term_by('slug', $slug, 'category');
    $term_id = null;
    if ($category) {
        $term_id = (int) $category->term_id;
    }
    $resolved_ids[$slug] = $term_id;

    return $term_id;
}
/**
 * Source of the axes applied on the last import(): 'spip' | 'coauthors' | 'owner' | 'none'.
 * Initial value is 'none'; import() overwrites it with the 'source' key returned by
 * resolve_axes_cascade().
 */
public $last_axes_source = 'none';
/**
 * Tell whether a HAL identifier is already recorded in post meta.
 *
 * Counts the postmeta rows whose meta_key is 'hal_id' and whose meta_value equals $hal_id.
 *
 * @param mixed $hal_id HAL identifier; any empty value short-circuits to false.
 * @return bool True when at least one matching postmeta row exists.
 */
public function is_imported($hal_id) {
    global $wpdb;

    if (empty($hal_id)) {
        return false;
    }

    $count_query = $wpdb->prepare(
        "SELECT COUNT(*) FROM {$wpdb->postmeta} WHERE meta_key = 'hal_id' AND meta_value = %s",
        $hal_id
    );
    $row_count = (int) $wpdb->get_var($count_query);

    return $row_count > 0;
}
/**
 * Find the post that carries the given hal_id meta value.
 *
 * @param mixed $hal_id HAL identifier; any empty value short-circuits to null.
 * @return array|null ['id' => int, 'status' => string] for one matching post, or null when none is found.
 */
public function get_imported_post($hal_id) {
    if (empty($hal_id)) {
        return null;
    }

    global $wpdb;

    // The SQL continuation lines stay flush-left so the query text is unchanged.
    $lookup_query = $wpdb->prepare(
        "SELECT p.ID, p.post_status FROM {$wpdb->posts} p
INNER JOIN {$wpdb->postmeta} pm ON pm.post_id = p.ID
WHERE pm.meta_key = 'hal_id' AND pm.meta_value = %s
LIMIT 1",
        $hal_id
    );
    $match = $wpdb->get_row($lookup_query);
    if (!$match) {
        return null;
    }

    return [
        'id' => (int) $match->ID,
        'status' => $match->post_status,
    ];
}
/**
 * Resolve the WordPress category term_id mapped to a HAL doc type.
 *
 * @param mixed $doc_type HAL docType_s value (e.g. 'ART').
 * @return int|null Category term_id, or null when the doc type is not in DOC_TYPE_SLUGS
 *                  or its category slug does not resolve to a term.
 */
public function get_category_id($doc_type) {
    $category_slug = self::DOC_TYPE_SLUGS[$doc_type] ?? null;
    if (!$category_slug) {
        return null;
    }

    return $this->cat_id_by_slug($category_slug);
}
/**
 * Build the full doc type => category term_id map.
 *
 * Keys and their order are those of DOC_TYPE_SLUGS; a value is null when the
 * category slug does not resolve to a term.
 *
 * @return array Map of HAL doc type => int|null term_id.
 */
public function get_doc_type_map() {
    // array_map over a single array keeps the string keys of DOC_TYPE_SLUGS.
    return array_map(
        fn (string $category_slug): ?int => $this->cat_id_by_slug($category_slug),
        self::DOC_TYPE_SLUGS
    );
}
/**
 * Import a HAL publication as a WordPress post.
 *
 * Creates the post, then writes the category, HAL meta, tags, date, type picks, members,
 * thematic axes and research programmes derived from the HAL document and the optional
 * SPIP context.
 *
 * @param array    $hal_doc            Raw HAL API document.
 * @param array    $wp_users_by_hal_id Map of normalized_hal_id => ['id' => int, 'name' => string].
 * @param string   $post_status        Target post_status (default 'pending').
 * @param bool     $backdate_post      Use producedDate_s as post_date (default false).
 * @param array    $spip_context       SPIP-derived context for bulk imports:
 *                                     ['axes' => int[], 'tags' => int[], 'programmes' => int[], 'owner_user_id' => ?int]
 * @param int|null $force_post_author  When not null, used as post_author instead of the first
 *                                     matched user (which itself falls back to user 1).
 * @return int|WP_Error New post ID on success, WP_Error on failure.
 *                      The axes source is stored in $this->last_axes_source for caller reporting.
 */
public function import(
    array $hal_doc,
    array $wp_users_by_hal_id = [],
    string $post_status = 'pending',
    bool $backdate_post = false,
    array $spip_context = [],
    ?int $force_post_author = null
) {
    // Reset first so a failed import never reports the axes source of a previous call.
    $this->last_axes_source = 'none';

    $hal_id = $this->first_hal_string($hal_doc['halId_s'] ?? '');
    $doc_type = $this->first_hal_string($hal_doc['docType_s'] ?? '');
    if (empty($hal_id)) {
        return new WP_Error('no_id', 'Missing HAL ID');
    }
    if ($this->is_imported($hal_id)) {
        return new WP_Error('exists', 'Already imported: ' . $hal_id);
    }

    // --- Resolve post author from HAL author IDs ---
    $author_hal_ids = (array) ($hal_doc['authIdHal_s'] ?? []);
    $author_names = (array) ($hal_doc['authFullName_s'] ?? []);
    $matched_user_ids = [];
    $matched_user_names = [];
    foreach ($author_hal_ids as $hal_author_id) {
        $normalized = strtolower(trim((string) $hal_author_id));
        if (!isset($wp_users_by_hal_id[$normalized])) {
            continue;
        }
        $user = $wp_users_by_hal_id[$normalized];
        // The same member must not be stored twice in the relation.
        if (in_array($user['id'], $matched_user_ids, true)) {
            continue;
        }
        $matched_user_ids[] = $user['id'];
        $matched_user_names[] = $user['name'];
    }
    $post_author = $force_post_author
        ?? (!empty($matched_user_ids) ? $matched_user_ids[0] : 1);

    // --- Create the post ---
    $title = $this->first_hal_string($hal_doc['title_s'] ?? '');
    $post_args = [
        'post_title' => wp_strip_all_tags($title),
        'post_content' => wp_kses_post($this->first_hal_string($hal_doc['abstract_s'] ?? '')),
        'post_status' => $post_status,
        'post_type' => 'post',
        'post_author' => $post_author,
    ];
    // Backdate post_date to HAL producedDate_s when requested (for legacy bulk imports)
    if ($backdate_post) {
        $backdate_ymd = $this->parse_hal_date($this->first_hal_string($hal_doc['producedDate_s'] ?? ''));
        if ($backdate_ymd) {
            $local_noon = $backdate_ymd . ' 12:00:00';
            $post_args['post_date'] = $local_noon;
            // post_date is in the site timezone; convert it for the GMT column.
            $post_args['post_date_gmt'] = get_gmt_from_date($local_noon);
        }
    }
    // wp_insert_post() unslashes its input, so slash first or backslashes in HAL text are lost.
    $post_id = wp_insert_post(wp_slash($post_args), true);
    if (is_wp_error($post_id)) {
        return $post_id;
    }

    // --- Category — centralised Pods storage (see class-pods-storage.php) ---
    $cat_id = $this->get_category_id($doc_type);
    if ($cat_id) {
        Thalim_HAL_Pods_Storage::set_categorie($post_id, $cat_id);
    }

    // --- Core meta ---
    $this->set_text_meta($post_id, 'hal_id', $hal_id);
    $this->set_text_meta($post_id, 'hal_url', $this->first_hal_string($hal_doc['uri_s'] ?? ''));

    // HAL PDF file -> lien_externe_1
    $file_url = $this->first_hal_string($hal_doc['fileMain_s'] ?? '');
    if ($file_url) {
        $this->set_text_meta($post_id, 'lien_externe_1', $file_url);
        $this->set_text_meta($post_id, 'titre_du_lien_externe_1', 'Document HAL // HAL Document');
    }

    // Journal (ART)
    $journal = $this->first_hal_string($hal_doc['journalTitle_s'] ?? '');
    if ($journal) {
        $this->set_text_meta($post_id, 'journal', $journal);
    }

    // Book title as sous-titre (COUV), only if different from post title
    $book_title = $this->first_hal_string($hal_doc['bookTitle_s'] ?? '');
    if ($book_title && $book_title !== $title) {
        $this->set_text_meta($post_id, 'sous-titre', $book_title);
    }

    // Publisher -> editeur (plain text, no Pods triple-storage needed)
    $publisher = $this->first_hal_string($hal_doc['publisher_s'] ?? '');
    if ($publisher) {
        $this->set_text_meta($post_id, 'editeur', $publisher);
    }

    // Author-function label: bilingual plain text, only written for these doc types
    // (the original note says it is only relevant for categories 4, 15 and 16).
    $author_functions = [
        'COUV' => 'Auteur du chapitre // Chapter author',
        'ISSUE' => 'Direction de numéro // Editor-in-Chief',
        'TRAD' => 'Traduction // Translation',
    ];
    if (isset($author_functions[$doc_type])) {
        $this->set_text_meta($post_id, 'fonction_auteur', $author_functions[$doc_type]);
    }

    // --- HAL keywords + SPIP tags -> étiquettes (a single Pods write) ---
    $keywords = (array) ($hal_doc['keyword_s'] ?? []);
    $etiquette_ids = $this->match_keywords_to_tags($keywords);
    if (!empty($spip_context['tags'])) {
        $etiquette_ids = array_merge($etiquette_ids, array_map('intval', (array) $spip_context['tags']));
    }
    // A tag found by both sources must only be linked once; drop zero IDs.
    $etiquette_ids = array_values(array_unique(array_filter($etiquette_ids)));
    if (!empty($etiquette_ids)) {
        Thalim_HAL_Pods_Storage::set_relation($post_id, 'etiquettes', $etiquette_ids, 'post_tag');
    }

    // --- Date meta ---
    $date_raw = $this->first_hal_string($hal_doc['producedDate_s'] ?? '');
    // THESE/HDR: use defenseDate_s if available, fallback to producedDate_s
    if (in_array($doc_type, ['THESE', 'HDR'], true)) {
        $defense = $this->first_hal_string($hal_doc['defenseDate_s'] ?? '');
        if ($defense) {
            $date_raw = $defense;
        }
    }
    $date_meta = $this->parse_hal_date($date_raw);
    if ($date_meta) {
        $date_field = in_array($doc_type, self::EVENT_DOC_TYPES, true) ? 'date_de_debut' : 'datetime';
        $this->set_text_meta($post_id, $date_field, $date_meta);
    }

    // --- Type pick fields (pick custom-simple — no triple-storage) ---
    $type_picks = [
        'PROCEEDINGS' => ['type_colloque', 'Colloque'],
        'THESE' => ['type_soutenance', 'Soutenance de thèse'],
        'HDR' => ['type_soutenance', "Soutenance d'habilitation"],
        'SON' => ['type_captation', 'Son'],
        'VIDEO' => ['type_captation', 'Vidéo'],
    ];
    if (isset($type_picks[$doc_type])) {
        [$field, $value] = $type_picks[$doc_type];
        $this->set_text_meta($post_id, $field, $value);
    }

    if ($doc_type === 'PROCEEDINGS') {
        // --- Lieu (city, country from HAL) ---
        $city = $this->first_hal_string($hal_doc['city_s'] ?? '');
        $country = $this->first_hal_string($hal_doc['country_s'] ?? '');
        // Trimming ", " leaves a clean value when only one of the two parts is present.
        $lieu = trim("$city, $country", ', ');
        if ($lieu) {
            $this->set_text_meta($post_id, 'lieu', $lieu);
        }

        // --- Conference title as sous-titre (overrides a book title written above) ---
        $conf_title = $this->first_hal_string($hal_doc['conferenceTitle_s'] ?? '');
        if ($conf_title) {
            $this->set_text_meta($post_id, 'sous-titre', $conf_title);
        }
    }

    // --- Bibliographic reference from citationFull_s (publications/ouvrages/articles) ---
    $citation_cats = array_filter([
        $this->cat_id_by_slug('publications-et-productions'),
        $this->cat_id_by_slug('ouvrages'),
        $this->cat_id_by_slug('articles'),
    ]);
    $citation = $this->first_hal_string($hal_doc['citationFull_s'] ?? '');
    if ($citation && in_array($cat_id, $citation_cats, true)) {
        $this->set_text_meta($post_id, 'reference_bibliographique', wp_kses_post($citation));
    }

    // --- Store matched THALIM members ---
    if (!empty($matched_user_ids)) {
        Thalim_HAL_Pods_Storage::set_relation($post_id, 'membres', $matched_user_ids, null);
    }

    // --- Thematic axes: cascade (SPIP direct > co-authors > owner) ---
    $axes_resolution = $this->resolve_axes_cascade($matched_user_ids, $spip_context);
    $this->last_axes_source = $axes_resolution['source'];
    if (!empty($axes_resolution['term_ids'])) {
        Thalim_HAL_Pods_Storage::set_relation($post_id, 'axes_thematiques', $axes_resolution['term_ids'], 'axe_thematique');
    }

    // --- Research programmes: SPIP direct OR keyword matching ---
    $prog_ids = !empty($spip_context['programmes'])
        ? array_map('intval', (array) $spip_context['programmes'])
        : $this->match_terms_by_keywords($keywords, 'programme_de_recherche');
    $prog_ids = array_values(array_unique(array_filter($prog_ids)));
    if (!empty($prog_ids)) {
        Thalim_HAL_Pods_Storage::set_relation($post_id, 'programmes_de_recherche', $prog_ids, 'programme_de_recherche');
    }

    // --- Unmatched authors as free text: remove matched names from the full list ---
    // Loose comparison: ignore case and surrounding spaces.
    $matched_lookup = [];
    foreach ($matched_user_names as $matched_name) {
        $matched_lookup[mb_strtolower(trim((string) $matched_name))] = true;
    }
    $unmatched = array_filter(
        $author_names,
        static fn ($name): bool => !isset($matched_lookup[mb_strtolower(trim((string) $name))])
    );
    if (!empty($unmatched)) {
        $this->set_text_meta($post_id, 'autrepersonnes', implode(', ', $unmatched));
    }

    return $post_id;
}

/**
 * Reduce a HAL field that may be a scalar or a list to a single string.
 *
 * @param mixed $value Raw field value from the HAL document.
 * @return string Element 0 of an array, the value itself when scalar, '' otherwise.
 */
private function first_hal_string($value): string {
    if (is_array($value)) {
        $value = $value[0] ?? '';
    }
    return is_scalar($value) ? (string) $value : '';
}

/**
 * Write a text post meta value, slashing it first.
 *
 * update_post_meta() unslashes the value it receives, so unslashed input
 * would lose its backslashes.
 *
 * @param int    $post_id Post to update.
 * @param string $key     Meta key.
 * @param string $value   Unslashed meta value.
 */
private function set_text_meta(int $post_id, string $key, string $value): void {
    update_post_meta($post_id, $key, wp_slash($value));
}
/**
 * Match HAL keyword strings against existing WordPress terms in a given taxonomy.
 *
 * WP terms are often stored bilingually as "Terme FR // English term".
 * Matching is case-insensitive against both the FR and EN parts.
 * Blank keywords and blank term parts never match, and the taxonomy is not
 * queried at all when no usable keyword remains.
 *
 * @param string[] $hal_keywords Raw keyword strings from HAL keyword_s field.
 * @param string   $taxonomy     WordPress taxonomy (e.g. 'post_tag', 'programme_de_recherche').
 * @return int[] Matched term IDs, in the order get_terms() returned the terms.
 */
private function match_terms_by_keywords(array $hal_keywords, string $taxonomy = 'post_tag'): array {
    // Normalise the HAL keywords once into a hash set for O(1) lookups.
    $wanted = [];
    foreach ($hal_keywords as $keyword) {
        if (!is_scalar($keyword)) {
            continue;
        }
        $normalized = mb_strtolower(trim((string) $keyword));
        if ($normalized !== '') {
            $wanted[$normalized] = true;
        }
    }
    if (empty($wanted)) {
        return [];
    }

    $terms = get_terms(['taxonomy' => $taxonomy, 'hide_empty' => false]);
    if (is_wp_error($terms) || empty($terms)) {
        return [];
    }

    $matched = [];
    foreach ($terms as $term) {
        // At most two parts: the FR label and, when present, the EN label.
        foreach (explode(' // ', $term->name, 2) as $part) {
            $candidate = mb_strtolower(trim($part));
            if ($candidate !== '' && isset($wanted[$candidate])) {
                $matched[] = (int) $term->term_id;
                break; // one hit per term is enough
            }
        }
    }
    return $matched;
}
/**
 * Match HAL keywords against terms of the 'post_tag' taxonomy.
 *
 * Thin wrapper kept under the former method name; it delegates to
 * match_terms_by_keywords().
 *
 * @param array $hal_keywords Raw keyword strings from HAL.
 * @return array Matched term IDs.
 */
private function match_keywords_to_tags(array $hal_keywords): array {
    $tag_taxonomy = 'post_tag';

    return $this->match_terms_by_keywords($hal_keywords, $tag_taxonomy);
}
/**
 * Resolve axes thématiques through a cascade of strategies.
 *
 * 1. Direct SPIP links ($spip_context['axes'])
 * 2. Axes from all matched WP co-authors (_pods_axes_thematiques user meta)
 * 3. Axes of the SPIP flux owner user ($spip_context['owner_user_id'])
 *
 * The first strategy that yields at least one non-zero term ID wins.
 *
 * @param array $matched_user_ids WP user IDs of the matched co-authors.
 * @param array $spip_context     SPIP context; only 'axes' and 'owner_user_id' are read here.
 * @return array{source: string, term_ids: int[]} source is 'spip', 'coauthors', 'owner' or 'none'.
 */
private function resolve_axes_cascade(array $matched_user_ids, array $spip_context): array {
    // 1. SPIP direct
    if (!empty($spip_context['axes'])) {
        $ids = $this->normalize_term_ids((array) $spip_context['axes']);
        if (!empty($ids)) {
            return ['source' => 'spip', 'term_ids' => $ids];
        }
    }

    // 2. Co-authors matched (any matched THALIM member with an axe)
    $from_authors = [];
    foreach ($matched_user_ids as $uid) {
        foreach ($this->user_axes_term_ids((int) $uid) as $term_id) {
            $from_authors[] = $term_id;
        }
    }
    $from_authors = $this->normalize_term_ids($from_authors);
    if (!empty($from_authors)) {
        return ['source' => 'coauthors', 'term_ids' => $from_authors];
    }

    // 3. SPIP flux owner user
    if (!empty($spip_context['owner_user_id'])) {
        $ids = $this->user_axes_term_ids((int) $spip_context['owner_user_id']);
        if (!empty($ids)) {
            return ['source' => 'owner', 'term_ids' => $ids];
        }
    }

    return ['source' => 'none', 'term_ids' => []];
}

/**
 * Read the _pods_axes_thematiques user meta as a list of term IDs.
 *
 * The meta value is accepted either as an array or as a serialized array string.
 *
 * @param int $user_id WP user ID.
 * @return int[] Unique non-zero term IDs; empty when the meta is absent or has another shape.
 */
private function user_axes_term_ids(int $user_id): array {
    $raw = get_user_meta($user_id, '_pods_axes_thematiques', true);
    if (is_string($raw) && $raw !== '') {
        // allowed_classes => false: never instantiate objects from stored meta.
        // "@" because unserialize() emits a notice/warning on a non-serialized string.
        $raw = @unserialize($raw, ['allowed_classes' => false]);
    }
    return is_array($raw) ? $this->normalize_term_ids($raw) : [];
}

/**
 * Cast a list of IDs to int, drop zeros and duplicates, and reindex.
 *
 * @param array $ids Raw ID values.
 * @return int[]
 */
private function normalize_term_ids(array $ids): array {
    return array_values(array_unique(array_filter(array_map('intval', $ids))));
}
/**
 * Parse a HAL date (YYYY, YYYY-MM, YYYY-MM-DD, or ISO datetime) to Y-m-d.
 *
 * HAL often emits partial dates that strtotime mishandles (e.g. strtotime("2022")
 * interprets 2022 as a time, not a year), so the numeric forms are matched
 * explicitly and a missing month or day defaults to 01. Matched dates are
 * validated with checkdate(), so impossible values such as 2022-13-45 or
 * 0000-00-00 are rejected.
 *
 * @param string $raw Raw date string from HAL.
 * @return string Date as Y-m-d, or '' when the input is empty, invalid or unparseable.
 */
private function parse_hal_date(string $raw): string {
    $raw = trim($raw);
    if ($raw === '') {
        return '';
    }

    $year = null;
    $month = '01';
    $day = '01';
    if (preg_match('/^(\d{4})-(\d{2})-(\d{2})/', $raw, $m)) {
        // Prefix match on purpose: an ISO time part after the date is ignored.
        [, $year, $month, $day] = $m;
    } elseif (preg_match('/^(\d{4})-(\d{2})$/', $raw, $m)) {
        [, $year, $month] = $m;
    } elseif (preg_match('/^(\d{4})$/', $raw, $m)) {
        $year = $m[1];
    }

    if ($year !== null) {
        return checkdate((int) $month, (int) $day, (int) $year)
            ? "{$year}-{$month}-{$day}"
            : '';
    }

    // Last resort for any other format; strtotime() returns false on failure.
    $ts = strtotime($raw);
    return $ts !== false ? date('Y-m-d', $ts) : '';
}
}