# extract_html.py

import os
from html import escape
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def ensure_url_scheme(url):
    """Return *url* with an explicit HTTP(S) scheme, defaulting to HTTPS.

    Args:
        url: A URL that may or may not start with ``http://`` or ``https://``.

    Returns:
        *url* unchanged when it already carries an HTTP(S) scheme; otherwise
        ``'https://'`` followed by *url* with its leading slashes removed, so
        that protocol-relative values such as ``//host/img.png`` become
        ``https://host/img.png``.
    """
    # URL schemes are case-insensitive, so "HTTP://host" must be recognised as
    # already having a scheme instead of being prefixed a second time.
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'https://' + url.lstrip('/')

def get_publications_from_hal(author_id):
    """Fetch the publications of an author from the HAL search API.

    Args:
        author_id: Value matched against the ``authIdHal_s`` field of the
            HAL search index.

    Returns:
        The list found under ``response.docs`` in the JSON answer, or ``None``
        when the request fails, the status code is not 200, or the answer does
        not have the expected shape. An error message is printed in each of
        those failure cases.
    """
    # Let requests build the query string so that author_id is URL-encoded.
    params = {
        'q': f"authIdHal_s:{author_id}",
        'wt': 'json',
        'fl': "title_s,authFullName_s,producedDateY_i,uri_s,conferenceTitle_s,journalTitle_s,doiId_s,docType_s",
    }

    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(
            "https://api.archives-ouvertes.fr/search/",
            params=params,
            timeout=30,
        )
    except requests.RequestException as exc:
        print(f"Erreur : Impossible de contacter l'API de HAL : {exc}")
        return None

    if response.status_code != 200:
        print(f"Erreur : Impossible de récupérer les données de l'API de HAL. Code de statut : {response.status_code}")
        return None

    try:
        return response.json()['response']['docs']
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected layout.
        print(f"Erreur : Réponse inattendue de l'API de HAL : {exc}")
        return None

def download_image(image_url, image_path):
    """Download an image and store its raw bytes at *image_path*.

    Args:
        image_url: Address of the image; a missing scheme is completed by
            ``ensure_url_scheme``.
        image_path: Destination file. Its parent directory is created when it
            does not exist yet.

    Returns:
        ``True`` when the image was downloaded and written, ``False`` when the
        request failed or answered with a status code other than 200.

    Raises:
        OSError: If the destination directory or file cannot be written.
    """
    image_url = ensure_url_scheme(image_url)
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(image_url, timeout=30)
    except requests.RequestException:
        # The function already reports failure through its boolean result.
        return False

    if response.status_code != 200:
        return False

    parent_dir = os.path.dirname(image_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(image_path, 'wb') as f:
        f.write(response.content)
    return True

# Where the generated page and the downloaded images are written.
_OUTPUT_DIR = 'web/content/research'
_IMAGES_DIR = 'web/content/research/images'

# Maps spaces and characters that are unsafe in file names to '_'.
_UNSAFE_FILENAME_CHARS = str.maketrans(dict.fromkeys(' /\\:*?"<>|', '_'))


def _fetch_publication_image(uri, title):
    """Best-effort download of the first <img> of the page at *uri*.

    The image is saved as ``<title>.jpg`` (with unsafe characters replaced by
    underscores) in the images directory. Any network or file-system failure
    is reported with a message and otherwise ignored, so one unreachable page
    cannot abort the generation of the whole publication list.
    """
    try:
        page = requests.get(uri, timeout=30)
        img_tag = BeautifulSoup(page.content, 'html.parser').find('img')
        if not img_tag or 'src' not in img_tag.attrs:
            return
        # Resolve relative and protocol-relative sources against the page URL.
        image_url = urljoin(uri, img_tag['src'])
        safe_title = title.translate(_UNSAFE_FILENAME_CHARS)
        download_image(image_url, f"{_IMAGES_DIR}/{safe_title}.jpg")
    except (requests.RequestException, OSError) as exc:
        print(f"Avertissement : image non récupérée pour {uri} : {exc}")


def _render_publication(title, authors, venue, year, uri, doi):
    """Return the ``<li>`` markup of one publication, with all text escaped."""
    # Only join the parts that exist, so a missing venue does not leave a
    # dangling ", 2020" in the page.
    venue_line = ', '.join(str(part) for part in (venue, year) if part)
    doi_link = (
        f'<a href="https://doi.org/{escape(str(doi))}">, Lien DOI</a>'
        if doi else ''
    )
    return f"""
            <li class="publication">
                <p><strong>{escape(str(title))}</strong></p>
                <p>{escape(authors)}</p>
                <p>{escape(venue_line)}</p>
                <p><a href="{escape(str(uri))}">Lien vers HAL</a> {doi_link}</p>
            </li>
        """


def create_html(publications):
    """Write the publications page, grouped by year from newest to oldest.

    For every publication that has a ``uri_s``, the first image of that page
    is also downloaded into ``web/content/research/images`` (best effort).
    The page itself is written to ``web/content/research/publications.html``;
    both directories are created when missing.

    Args:
        publications: List of dicts using the HAL field names requested by
            this script (``title_s``, ``authFullName_s``, ``producedDateY_i``,
            ``uri_s``, ``conferenceTitle_s``, ``journalTitle_s``, ``doiId_s``).
            The list is sorted in place by ``producedDateY_i``, descending.

    Raises:
        OSError: If the output directory or file cannot be written.
    """
    # Creating the images directory also creates the output directory above it.
    os.makedirs(_IMAGES_DIR, exist_ok=True)

    # Newest first; publications without a year are placed last.
    publications.sort(key=lambda x: x.get('producedDateY_i', 0), reverse=True)

    # Collect the fragments and join once instead of growing a string.
    chunks = ["""
    <!DOCTYPE html>
    <html lang="fr">
    <head>
        <meta charset="UTF-8">
        <title>Publications</title>
        <style>
            ul {
                list-style-type: disc;
                padding-left: 20px;
            }
            .year-section {
                margin-top: 20px;
            }
            .publication {
                margin-bottom: 10px;
            }
            img {
                max-width: 100px;
                height: auto;
                margin-right: 10px;
            }
            p {
                margin: 0;
            }
        </style>
    </head>
    <body>
        <h1>Liste des Publications</h1>
    """]

    current_year = None
    section_open = False

    for pub in publications:
        title_list = pub.get('title_s', [])
        title = title_list[0] if title_list else ''
        authors = ', '.join(pub.get('authFullName_s', []))
        year = pub.get('producedDateY_i', '')
        uri = pub.get('uri_s', '')
        venue = pub.get('conferenceTitle_s', '') or pub.get('journalTitle_s', '')
        doi = pub.get('doiId_s', '')

        if uri:
            _fetch_publication_image(uri, title)

        # Start a new section each time the year changes.
        if not section_open or year != current_year:
            if section_open:
                chunks.append("</ul></div>")
            current_year = year
            section_open = True
            chunks.append(
                f"<div class='year-section'><h2>{escape(str(year))}</h2><ul>"
            )

        chunks.append(_render_publication(title, authors, venue, year, uri, doi))

    # Only close a year section if one was opened (empty input opens none).
    if section_open:
        chunks.append("""
        </ul></div>""")
    chunks.append("""
    </body>
    </html>
    """)

    with open(f"{_OUTPUT_DIR}/publications.html", 'w', encoding='utf-8') as f:
        f.write(''.join(chunks))

# Example usage: build the publications page for one HAL author.
# NOTE(review): this runs at import time (there is no __main__ guard), so
# importing this module triggers the HAL request and the page generation.
author_id = "alexandre-meyer"
publications = get_publications_from_hal(author_id)

# Skip generation when the fetch returned None or an empty list.
if publications:
    create_html(publications)