Initial commit

This commit is contained in:
2026-05-12 23:33:56 +02:00
commit 57719052f3
6 changed files with 1611 additions and 0 deletions

523
includes/class-importer.php Normal file
View File

@@ -0,0 +1,523 @@
<?php
/**
 * Importer logic class: handles the import of HAL publications into WordPress posts.
 */
// Stop immediately when the ABSPATH constant is not defined (file loaded outside WordPress).
if (!defined('ABSPATH')) {
exit;
}
class Thalim_HAL_Importer_Logic {

    /** HAL document type (docType_s) => WordPress category ID. */
    private const DOC_TYPE_MAP = [
        'ART'         => 16, // Article
        'COUV'        => 16, // Book chapter -> "Articles"
        'OUV'         => 15, // Book -> "Ouvrages"
        'COMM'        => 13, // Conference paper -> "Communications"
        'ISSUE'       => 16, // Journal issue editorship -> "Articles"
        'PROCEEDINGS' => 15, // Edited volume / proceedings -> "Ouvrages"
        'THESE'       => 14, // PhD thesis -> "Soutenances"
        'HDR'         => 14, // HDR -> "Soutenances"
        'SON'         => 19, // Audio -> "Captations audio/vidéo"
        'VIDEO'       => 19, // Video -> "Captations audio/vidéo"
    ];

    /** Doc types whose date is stored in `date_de_debut` instead of `datetime`. */
    private const EVENT_DOC_TYPES = ['COMM', 'THESE', 'HDR', 'SON', 'VIDEO'];

    /** Category IDs for which the bibliographic reference is stored. */
    private const BIBLIO_REFERENCE_CATEGORY_IDS = [4, 15, 16];

    /** Doc type => [meta key, value] for the "pick custom-simple" type fields (no triple-storage). */
    private const TYPE_PICKS = [
        'PROCEEDINGS' => ['type_colloque', 'Colloque'],
        'THESE'       => ['type_soutenance', 'Soutenance de thèse'],
        'HDR'         => ['type_soutenance', "Soutenance d'habilitation"],
        'SON'         => ['type_captation', 'Son'],
        'VIDEO'       => ['type_captation', 'Vidéo'],
    ];

    /** post_author used when no HAL author matches a WordPress user. */
    private const FALLBACK_AUTHOR_ID = 1;

    // Pods IDs — queried from the DB, stable per installation.
    private const POD_ID_POST          = 8;
    private const FIELD_ID_CATEGORIE   = 16;  // "Type d'annonce" (picks from WP category)
    private const FIELD_ID_MEMBRES     = 178;
    private const FIELD_ID_AUTRE_MBRES = 195; // autre_membres (unused in import, for reference)
    private const FIELD_ID_AXES        = 270; // axes_thematiques (picks from axe_thematique)
    private const FIELD_ID_PROGRAMMES  = 271; // programmes_de_recherche (picks from programme_de_recherche)
    private const FIELD_ID_ETIQUETTES  = 652; // etiquettes (picks from post_tag)

    /**
     * Source of the axes applied by the last import() call:
     * 'spip' | 'coauthors' | 'owner' | 'none'. Reset to 'none' at the start of every import().
     *
     * @var string
     */
    public $last_axes_source = 'none';

    /**
     * Tell whether a publication with this HAL ID already exists as post meta.
     *
     * @param mixed $hal_id HAL identifier (halId_s).
     * @return bool True when at least one `hal_id` postmeta row holds this value.
     */
    public function is_imported($hal_id) {
        if (empty($hal_id)) {
            return false;
        }

        global $wpdb;

        // Existence check only: stop at the first matching row instead of counting them all.
        $found = $wpdb->get_var($wpdb->prepare(
            "SELECT meta_id FROM {$wpdb->postmeta} WHERE meta_key = 'hal_id' AND meta_value = %s LIMIT 1",
            $hal_id
        ));

        return $found !== null;
    }

    /**
     * Get the WordPress category ID mapped to a HAL doc type.
     *
     * @param mixed $doc_type HAL docType_s value.
     * @return int|null Category ID, or null when the type is unknown or not a valid array key.
     */
    public function get_category_id($doc_type) {
        // Arrays/objects are illegal array offsets; treat them as "unknown type".
        if (!is_string($doc_type) && !is_int($doc_type)) {
            return null;
        }

        return self::DOC_TYPE_MAP[$doc_type] ?? null;
    }

    /**
     * Get the full HAL doc type => category ID map.
     *
     * @return array<string, int>
     */
    public function get_doc_type_map() {
        return self::DOC_TYPE_MAP;
    }

    /**
     * Import a HAL publication as a WordPress post.
     *
     * @param array  $hal_doc            Raw HAL API document.
     * @param array  $wp_users_by_hal_id Map of normalized_hal_id => ['id' => int, 'name' => string].
     * @param string $post_status        Target post_status (default 'pending').
     * @param bool   $backdate_post      Use producedDate_s as post_date (default false).
     * @param array  $spip_context       SPIP-derived context for bulk imports:
     *                                   ['axes' => int[], 'tags' => int[], 'programmes' => int[], 'owner_user_id' => ?int]
     * @return int|WP_Error New post ID on success, WP_Error on failure.
     *                      The axes source is stored in $this->last_axes_source for caller reporting.
     */
    public function import(
        array $hal_doc,
        array $wp_users_by_hal_id = [],
        string $post_status = 'pending',
        bool $backdate_post = false,
        array $spip_context = []
    ) {
        // Do not let the value of a previous run survive an early error return.
        $this->last_axes_source = 'none';

        $hal_id   = $hal_doc['halId_s'] ?? '';
        $doc_type = $this->first_value($hal_doc['docType_s'] ?? '');

        if (empty($hal_id)) {
            return new WP_Error('no_id', 'Missing HAL ID');
        }
        if ($this->is_imported($hal_id)) {
            return new WP_Error('exists', 'Already imported: ' . $hal_id);
        }

        // Multi-valued HAL fields: force array shape so a scalar value cannot break the loops below.
        $keywords     = (array) ($hal_doc['keyword_s'] ?? []);
        $author_names = (array) ($hal_doc['authFullName_s'] ?? []);
        $title        = $this->first_value($hal_doc['title_s'] ?? '');

        // --- Resolve post author from HAL author IDs ---
        [$matched_user_ids, $matched_user_names] = $this->match_authors(
            (array) ($hal_doc['authIdHal_s'] ?? []),
            $wp_users_by_hal_id
        );

        // --- Create the post ---
        $post_args = [
            'post_title'   => wp_strip_all_tags($title),
            'post_content' => wp_kses_post($this->first_value($hal_doc['abstract_s'] ?? '')),
            'post_status'  => $post_status,
            'post_type'    => 'post',
            'post_author'  => $matched_user_ids[0] ?? self::FALLBACK_AUTHOR_ID,
        ];

        // Backdate post_date to HAL producedDate_s when requested (for legacy bulk imports).
        if ($backdate_post) {
            $backdate_ymd = $this->parse_hal_date($this->first_value($hal_doc['producedDate_s'] ?? ''));
            if ($backdate_ymd !== '') {
                $local_datetime = $backdate_ymd . ' 12:00:00';
                $post_args['post_date'] = $local_datetime;
                // Passed explicitly so the backdate also sticks for 'pending'/'draft' posts,
                // and converted so the GMT column really holds GMT rather than site-local time.
                $post_args['post_date_gmt'] = get_gmt_from_date($local_datetime);
            }
        }

        $post_id = wp_insert_post($post_args, true);
        if (is_wp_error($post_id)) {
            return $post_id;
        }
        $post_id = (int) $post_id;

        // --- Category ---
        $cat_id = $this->get_category_id($doc_type);
        if ($cat_id) {
            $this->assign_category($post_id, (int) $cat_id);
        }

        // --- Core meta ---
        $this->store_core_meta($post_id, $hal_doc, $hal_id, $doc_type, $title);

        // --- Tags ("etiquettes"): HAL keyword matches + direct SPIP tags, written in ONE pass ---
        // A single write keeps `_pods_etiquettes` complete and the podsrel weights contiguous.
        $tag_ids = self::normalize_ids(array_merge(
            $this->match_keywords_to_tags($keywords),
            (array) ($spip_context['tags'] ?? [])
        ));
        if (!empty($tag_ids)) {
            $this->set_pods_taxonomy_multi(
                $post_id, 'etiquettes', self::FIELD_ID_ETIQUETTES,
                $tag_ids, 'post_tag'
            );
        }

        // --- Date meta ---
        $this->store_date_meta($post_id, $hal_doc, $doc_type);

        // --- Type pick fields (pick custom-simple — no triple-storage) ---
        if (isset(self::TYPE_PICKS[$doc_type])) {
            [$field, $value] = self::TYPE_PICKS[$doc_type];
            update_post_meta($post_id, $field, $value);
        }

        // --- Place and conference title for PROCEEDINGS ---
        if ($doc_type === 'PROCEEDINGS') {
            $this->store_proceedings_meta($post_id, $hal_doc);
        }

        // --- Bibliographic reference from citationFull_s (categories 4, 15, 16) ---
        $citation = $hal_doc['citationFull_s'] ?? '';
        if ($citation && in_array($cat_id, self::BIBLIO_REFERENCE_CATEGORY_IDS, true)) {
            update_post_meta($post_id, 'reference_bibliographique', wp_kses_post($citation));
        }

        // --- Matched THALIM members ---
        $this->store_members($post_id, $matched_user_ids);

        // --- Thematic axes: cascade (direct SPIP > co-authors > owner) ---
        $axes_resolution = $this->resolve_axes_cascade($matched_user_ids, $spip_context);
        $this->last_axes_source = $axes_resolution['source'];
        if (!empty($axes_resolution['term_ids'])) {
            $this->set_pods_taxonomy_multi(
                $post_id, 'axes_thematiques', self::FIELD_ID_AXES,
                $axes_resolution['term_ids'], 'axe_thematique'
            );
        }

        // --- Research programmes: direct SPIP links, otherwise keyword matching ---
        $prog_ids = self::normalize_ids($spip_context['programmes'] ?? []);
        if (empty($prog_ids)) {
            $prog_ids = $this->match_terms_by_keywords($keywords, 'programme_de_recherche');
        }
        if (!empty($prog_ids)) {
            $this->set_pods_taxonomy_multi(
                $post_id, 'programmes_de_recherche', self::FIELD_ID_PROGRAMMES,
                $prog_ids, 'programme_de_recherche'
            );
        }

        // --- Unmatched authors as free text ---
        $unmatched = $this->filter_unmatched_authors($author_names, $matched_user_names);
        if (!empty($unmatched)) {
            update_post_meta($post_id, 'autrepersonnes', implode(', ', $unmatched));
        }

        // --- Polylang: assign French language ---
        if (function_exists('pll_set_post_language')) {
            pll_set_post_language($post_id, 'fr');
        }

        return $post_id;
    }

    /**
     * Match HAL author IDs against the known WordPress users.
     *
     * Each WordPress user is returned at most once, in order of first appearance,
     * even when several HAL IDs map to the same user.
     *
     * @param array $author_hal_ids     HAL author identifiers (authIdHal_s).
     * @param array $wp_users_by_hal_id Map of normalized_hal_id => ['id' => int, 'name' => string].
     * @return array{0: int[], 1: string[]} Matched user IDs and their names (parallel lists).
     */
    private function match_authors(array $author_hal_ids, array $wp_users_by_hal_id): array {
        $user_ids   = [];
        $user_names = [];

        foreach ($author_hal_ids as $hal_author_id) {
            if (!is_scalar($hal_author_id)) {
                continue;
            }
            $normalized = strtolower(trim((string) $hal_author_id));
            if (!isset($wp_users_by_hal_id[$normalized])) {
                continue;
            }

            $user    = $wp_users_by_hal_id[$normalized];
            $user_id = (int) ($user['id'] ?? 0);
            if ($user_id <= 0 || in_array($user_id, $user_ids, true)) {
                continue;
            }

            $user_ids[]   = $user_id;
            $user_names[] = (string) ($user['name'] ?? '');
        }

        return [$user_ids, $user_names];
    }

    /**
     * Assign the category using the Pods storage pattern:
     * native WP category, `categorie` meta, `_pods_categorie` meta and a podsrel row.
     */
    private function assign_category(int $post_id, int $cat_id): void {
        // 1. Native WP category assignment
        wp_set_post_categories($post_id, [$cat_id]);
        // 2. Pods postmeta: single integer value
        update_post_meta($post_id, 'categorie', $cat_id);
        // 3. Pods _pods_ meta: array of one integer
        update_post_meta($post_id, '_pods_categorie', [$cat_id]);
        // 4. podsrel row
        $this->insert_podsrel_row(self::FIELD_ID_CATEGORIE, $post_id, $cat_id, 0);
    }

    /**
     * Store the plain-text meta fields: HAL ID/URL, external link, journal,
     * subtitle (book title), publisher and author-function label.
     *
     * @param mixed  $hal_id HAL identifier, stored as-is.
     * @param string $title  First title of the HAL document, used to skip a redundant subtitle.
     */
    private function store_core_meta(int $post_id, array $hal_doc, $hal_id, string $doc_type, string $title): void {
        update_post_meta($post_id, 'hal_id', $hal_id);
        update_post_meta($post_id, 'hal_url', $hal_doc['uri_s'] ?? '');

        // HAL PDF file -> lien_externe_1
        $file_url = $hal_doc['fileMain_s'] ?? '';
        if ($file_url) {
            update_post_meta($post_id, 'lien_externe_1', $file_url);
            update_post_meta($post_id, 'titre_du_lien_externe_1', 'Document HAL // HAL Document');
        }

        // Journal (ART)
        $journal = $hal_doc['journalTitle_s'] ?? '';
        if ($journal) {
            update_post_meta($post_id, 'journal', $journal);
        }

        // Book title as subtitle (COUV), only if different from the post title
        $book_title = $hal_doc['bookTitle_s'] ?? '';
        if ($book_title && $book_title !== $title) {
            update_post_meta($post_id, 'sous-titre', $book_title);
        }

        // Publisher -> editeur (plain text, no Pods triple-storage needed)
        $publisher = $this->first_value($hal_doc['publisher_s'] ?? '');
        if ($publisher) {
            update_post_meta($post_id, 'editeur', $publisher);
        }

        // Author-function label: bilingual plain text
        if ($doc_type === 'COUV') {
            update_post_meta($post_id, 'fonction_auteur', 'Auteur du chapitre // Chapter author');
        } elseif ($doc_type === 'ISSUE') {
            update_post_meta($post_id, 'fonction_auteur', 'Direction de numéro // Editor-in-Chief');
        }
    }

    /**
     * Store the publication date in `date_de_debut` (event doc types) or `datetime` (others).
     * THESE/HDR prefer defenseDate_s and fall back to producedDate_s.
     */
    private function store_date_meta(int $post_id, array $hal_doc, string $doc_type): void {
        $date_raw = $this->first_value($hal_doc['producedDate_s'] ?? '');

        if (in_array($doc_type, ['THESE', 'HDR'], true)) {
            $defense = $this->first_value($hal_doc['defenseDate_s'] ?? '');
            if ($defense) {
                $date_raw = $defense;
            }
        }

        $date_meta = $this->parse_hal_date($date_raw);
        if ($date_meta === '') {
            return;
        }

        $date_field = in_array($doc_type, self::EVENT_DOC_TYPES, true) ? 'date_de_debut' : 'datetime';
        update_post_meta($post_id, $date_field, $date_meta);
    }

    /**
     * Store PROCEEDINGS-only meta: `lieu` ("city, country") and the conference title as subtitle.
     * The conference title overwrites a subtitle previously set from the book title.
     */
    private function store_proceedings_meta(int $post_id, array $hal_doc): void {
        $city    = $this->first_value($hal_doc['city_s'] ?? '');
        $country = $this->first_value($hal_doc['country_s'] ?? '');

        // trim() drops the dangling separator when either part is missing.
        $lieu = trim("$city, $country", ', ');
        if ($lieu) {
            update_post_meta($post_id, 'lieu', $lieu);
        }

        $conf_title = $hal_doc['conferenceTitle_s'] ?? '';
        if ($conf_title) {
            update_post_meta($post_id, 'sous-titre', $conf_title);
        }
    }

    /**
     * Store matched members using the Pods storage pattern:
     * one `membres` meta row per user, `_pods_membres` meta and one podsrel row per user.
     *
     * @param int[] $user_ids WordPress user IDs, in display order.
     */
    private function store_members(int $post_id, array $user_ids): void {
        $user_ids = self::normalize_ids($user_ids);
        if (empty($user_ids)) {
            return;
        }

        // 1. Individual postmeta rows (one per user ID, as string)
        foreach ($user_ids as $user_id) {
            add_post_meta($post_id, 'membres', (string) $user_id);
        }
        // 2. _pods_ meta: array of user IDs as integers
        update_post_meta($post_id, '_pods_membres', $user_ids);
        // 3. podsrel rows (one per user, weight = position)
        foreach ($user_ids as $weight => $user_id) {
            $this->insert_podsrel_row(self::FIELD_ID_MEMBRES, $post_id, $user_id, $weight);
        }
    }

    /**
     * Keep the author names that do not correspond to a matched WordPress user.
     * Comparison ignores case and surrounding whitespace.
     *
     * @param array    $author_names  Full author names from HAL (authFullName_s).
     * @param string[] $matched_names Names of the matched WordPress users.
     * @return string[] Unmatched names, re-indexed, in their original order.
     */
    private function filter_unmatched_authors(array $author_names, array $matched_names): array {
        $matched_set = [];
        foreach ($matched_names as $matched) {
            $matched_set[mb_strtolower(trim((string) $matched))] = true;
        }

        $unmatched = [];
        foreach ($author_names as $name) {
            if (!is_scalar($name)) {
                continue;
            }
            $name = (string) $name;
            if (!isset($matched_set[mb_strtolower(trim($name))])) {
                $unmatched[] = $name;
            }
        }

        return $unmatched;
    }

    /**
     * Match HAL keyword strings against existing WordPress terms in a given taxonomy.
     *
     * WP terms are often stored bilingually as "Terme FR // English term".
     * Matching is case-insensitive against both the FR and EN parts.
     *
     * @param array  $hal_keywords Raw keyword strings from the HAL keyword_s field.
     * @param string $taxonomy     WordPress taxonomy (e.g. 'post_tag', 'programme_de_recherche').
     * @return int[] Matched term IDs.
     */
    private function match_terms_by_keywords(array $hal_keywords, string $taxonomy = 'post_tag'): array {
        // Normalise the HAL keywords once into a hash set for O(1) lookups.
        $wanted = [];
        foreach ($hal_keywords as $keyword) {
            if (!is_scalar($keyword)) {
                continue;
            }
            $normalized = mb_strtolower(trim((string) $keyword));
            if ($normalized !== '') {
                $wanted[$normalized] = true;
            }
        }
        if (empty($wanted)) {
            return [];
        }

        $terms = get_terms(['taxonomy' => $taxonomy, 'hide_empty' => false]);
        if (is_wp_error($terms) || empty($terms)) {
            return [];
        }

        $matched = [];
        foreach ($terms as $term) {
            $parts = explode(' // ', $term->name, 2);
            $fr    = mb_strtolower(trim($parts[0]));
            $en    = isset($parts[1]) ? mb_strtolower(trim($parts[1])) : null;

            if (isset($wanted[$fr]) || ($en !== null && isset($wanted[$en]))) {
                $matched[] = (int) $term->term_id;
            }
        }

        return $matched;
    }

    /**
     * Backwards-compatible alias: match HAL keywords against the `post_tag` taxonomy.
     *
     * @return int[] Matched term IDs.
     */
    private function match_keywords_to_tags(array $hal_keywords): array {
        return $this->match_terms_by_keywords($hal_keywords, 'post_tag');
    }

    /**
     * Resolve the thematic axes through a cascade of strategies; the first non-empty one wins.
     *
     * 1. Direct SPIP links ($spip_context['axes'])
     * 2. Axes of all matched WP co-authors (`_pods_axes_thematiques` user meta)
     * 3. Axes of the SPIP flux owner user ($spip_context['owner_user_id'])
     *
     * @return array{source: string, term_ids: int[]}
     */
    private function resolve_axes_cascade(array $matched_user_ids, array $spip_context): array {
        // 1. SPIP direct
        $spip_axes = self::normalize_ids($spip_context['axes'] ?? []);
        if (!empty($spip_axes)) {
            return ['source' => 'spip', 'term_ids' => $spip_axes];
        }

        // 2. Co-authors matched (any matched THALIM member with an axis)
        $from_authors = [];
        foreach ($matched_user_ids as $user_id) {
            foreach ($this->get_user_axes_ids((int) $user_id) as $term_id) {
                $from_authors[] = $term_id;
            }
        }
        $from_authors = self::normalize_ids($from_authors);
        if (!empty($from_authors)) {
            return ['source' => 'coauthors', 'term_ids' => $from_authors];
        }

        // 3. SPIP flux owner user
        if (!empty($spip_context['owner_user_id'])) {
            $owner_axes = $this->get_user_axes_ids((int) $spip_context['owner_user_id']);
            if (!empty($owner_axes)) {
                return ['source' => 'owner', 'term_ids' => $owner_axes];
            }
        }

        return ['source' => 'none', 'term_ids' => []];
    }

    /**
     * Read the thematic-axis term IDs stored on a user (`_pods_axes_thematiques` meta).
     * Accepts an array value or a value that is still a serialized string.
     *
     * @return int[] Unique positive term IDs; empty when the meta is missing or unusable.
     */
    private function get_user_axes_ids(int $user_id): array {
        $axes = get_user_meta($user_id, '_pods_axes_thematiques', true);

        if (is_string($axes) && $axes !== '') {
            // Value may still be serialized (e.g. double-serialized row); leaves other strings untouched.
            $axes = maybe_unserialize($axes);
        }

        return is_array($axes) ? self::normalize_ids($axes) : [];
    }

    /**
     * Parse a HAL date (YYYY, YYYY-MM, YYYY-MM-DD, or ISO datetime) to Y-m-d.
     *
     * Partial dates are completed with the first month/day. HAL often emits partial
     * dates that strtotime mishandles (e.g. strtotime("2022") reads 2022 as a time,
     * not a year), hence the explicit patterns before the strtotime fallback.
     *
     * @return string Date as Y-m-d, or '' when the input is empty, unparseable
     *                or not a real calendar date (e.g. "2022-13-45").
     */
    private function parse_hal_date(string $raw): string {
        $raw = trim($raw);
        if ($raw === '') {
            return '';
        }

        // Not end-anchored on purpose: accepts "YYYY-MM-DDTHH:MM:SSZ" and similar suffixes.
        if (preg_match('/^(\d{4})-(\d{2})-(\d{2})/', $raw, $m)) {
            return $this->format_valid_date((int) $m[1], (int) $m[2], (int) $m[3]);
        }
        if (preg_match('/^(\d{4})-(\d{2})$/', $raw, $m)) {
            return $this->format_valid_date((int) $m[1], (int) $m[2], 1);
        }
        if (preg_match('/^(\d{4})$/', $raw, $m)) {
            return $this->format_valid_date((int) $m[1], 1, 1);
        }

        $timestamp = strtotime($raw);

        return $timestamp !== false ? date('Y-m-d', $timestamp) : '';
    }

    /**
     * Format year/month/day as Y-m-d, or return '' when they do not form a real calendar date.
     */
    private function format_valid_date(int $year, int $month, int $day): string {
        return checkdate($month, $day, $year)
            ? sprintf('%04d-%02d-%02d', $year, $month, $day)
            : '';
    }

    /**
     * Generic Pods storage writer for multi-value taxonomy fields.
     * Writes to: term relationships, postmeta rows, `_pods_` meta and podsrel.
     *
     * Terms are appended to the post's existing terms; the `_pods_` meta is replaced
     * by $term_ids, so callers must pass the COMPLETE list for the field in one call.
     *
     * @param int    $post_id    Post receiving the terms.
     * @param string $field_name Pods field name (also the postmeta key).
     * @param int    $field_id   Pods field ID used in podsrel.
     * @param array  $term_ids   Term IDs; non-positive and duplicate IDs are dropped.
     * @param string $taxonomy   WordPress taxonomy of the terms.
     */
    private function set_pods_taxonomy_multi(int $post_id, string $field_name, int $field_id, array $term_ids, string $taxonomy): void {
        $term_ids = self::normalize_ids($term_ids);
        if (empty($term_ids)) {
            return;
        }

        // 1. Term relationships (append mode)
        wp_set_object_terms($post_id, $term_ids, $taxonomy, true);

        // 2. postmeta (one row per term ID, as string)
        foreach ($term_ids as $term_id) {
            add_post_meta($post_id, $field_name, (string) $term_id);
        }

        // 3. _pods_ meta: array of ints
        update_post_meta($post_id, '_pods_' . $field_name, $term_ids);

        // 4. podsrel rows (weight = position)
        foreach ($term_ids as $weight => $term_id) {
            $this->insert_podsrel_row($field_id, $post_id, $term_id, $weight);
        }
    }

    /**
     * Insert one relationship row into the `{prefix}podsrel` table for the post pod.
     * The insert result is not checked (best-effort, as everywhere else in the importer).
     */
    private function insert_podsrel_row(int $field_id, int $post_id, int $related_item_id, int $weight): void {
        global $wpdb;

        $wpdb->insert(
            $wpdb->prefix . 'podsrel',
            [
                'pod_id'           => self::POD_ID_POST,
                'field_id'         => $field_id,
                'item_id'          => $post_id,
                'related_pod_id'   => 0,
                'related_field_id' => 0,
                'related_item_id'  => $related_item_id,
                'weight'           => $weight,
            ],
            ['%d', '%d', '%d', '%d', '%d', '%d', '%d']
        );
    }

    /**
     * Return the first value of a possibly multi-valued HAL field as a string.
     *
     * @param mixed $value Scalar, or list whose element 0 is used.
     * @return string '' when the value is missing or not scalar.
     */
    private function first_value($value): string {
        if (is_array($value)) {
            $value = $value[0] ?? '';
        }

        return is_scalar($value) ? (string) $value : '';
    }

    /**
     * Normalise a list of IDs: cast to int, drop non-positive values and duplicates, re-index.
     *
     * @param mixed $ids List of IDs (a scalar is treated as a one-element list).
     * @return int[]
     */
    private static function normalize_ids($ids): array {
        $ints = array_map('intval', (array) $ids);
        $positive = array_filter($ints, static function (int $id): bool {
            return $id > 0;
        });

        return array_values(array_unique($positive));
    }
}