# extract_html.py

import html
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def ensure_url_scheme(url):
    """Return *url* with an explicit HTTP(S) scheme.

    A URL that already starts with ``http://`` or ``https://`` is returned
    unchanged. Otherwise any leading slashes are removed and ``https://`` is
    prepended.
    """
    already_absolute = url.startswith('http://') or url.startswith('https://')
    return url if already_absolute else 'https://' + url.lstrip('/')

def get_publications_from_hal(author_id, rows=50):
    """Fetch the publication records of one author from the HAL search API.

    Args:
        author_id: Value matched against the ``authIdHal_s`` field of the
            HAL index, e.g. ``"alexandre-meyer"``.
        rows: Maximum number of records requested from the API.

    Returns:
        The list found under ``response.docs`` in the JSON reply, or ``None``
        when the request fails or the API answers with a non-200 status.
    """
    url = "https://api.archives-ouvertes.fr/search/"
    # Fields returned for each record; they are the keys read later when the
    # HTML page is built.
    fields = (
        'title_s',
        'authFullName_s',
        'producedDateY_i',
        'uri_s',
        'conferenceTitle_s',
        'journalTitle_s',
        'doiId_s',
        'docType_s',
        'software_s',
    )
    # Passing the query as params lets requests URL-encode the author id.
    params = {
        'q': f'authIdHal_s:{author_id}',
        'wt': 'json',
        'fl': ','.join(fields),
        'start': 0,
        'rows': rows,
    }

    # Send the GET request; a timeout keeps the script from hanging forever.
    try:
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Erreur : Impossible de contacter l'API de HAL : {exc}")
        return None

    # Anything other than 200 is reported and treated as "no data".
    if response.status_code != 200:
        print(f"Erreur : Impossible de récupérer les données de l'API de HAL. Code de statut : {response.status_code}")
        return None

    data = response.json()
    return data['response']['docs']

def download_image(image_url, image_path):
    """Download an image and store it on disk.

    Args:
        image_url: Address of the image; a missing scheme is completed by
            ``ensure_url_scheme``.
        image_path: Destination file path. Its parent directory is created
            when it does not exist yet.

    Returns:
        ``True`` when the image was fetched (HTTP 200) and written,
        ``False`` when the request failed or returned another status.
    """
    image_url = ensure_url_scheme(image_url)
    try:
        response = requests.get(image_url, timeout=30)
    except requests.RequestException:
        # Network failures are reported the same way as a bad status code.
        return False
    if response.status_code != 200:
        return False

    # Make sure the destination directory exists before opening the file.
    parent_dir = os.path.dirname(image_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(image_path, 'wb') as f:
        f.write(response.content)
    return True

# Output locations, relative to the current working directory.
_OUTPUT_DIR = 'web/content/research'
_IMAGE_DIR = f'{_OUTPUT_DIR}/images'
_HTML_PATH = f'{_OUTPUT_DIR}/publications.html'

# Seconds to wait for a publication page before giving up on its image.
_PAGE_TIMEOUT = 30

# Document types that map to a fixed label, whatever the venue.
_DOC_TYPE_LABELS = {
    'HDR': "HDR",
    'REPORT': "Research Report",
    'THESE': "Thesis",
    'UNDEFINED': "Pre-publication",
    'COUV': "Book chapter",
    'SOFTWARE': "Software",
}

# Characters replaced by "_" when a title is used as a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans({char: '_' for char in ' /\\:*?"<>|'})


def _esc(value):
    """Return *value* as text that is safe inside HTML content or attributes."""
    return html.escape(str(value))


def _publication_type(doc_type, conference, journal, title):
    """Return the label shown for a publication (type, venue or "Other")."""
    label = _DOC_TYPE_LABELS.get(doc_type)
    if label is not None:
        if doc_type == 'UNDEFINED':
            # Trace records whose type is undefined so they can be reviewed.
            print(f"{doc_type} title={title}")
        return label
    if conference or journal:
        return conference if conference else journal
    # Unknown type without venue: trace it and fall back to a generic label.
    print(f"{doc_type} title={title}")
    return "Other"


def _save_publication_image(uri, title):
    """Best-effort download of the first <img> of the page at *uri*."""
    try:
        page = requests.get(uri, timeout=_PAGE_TIMEOUT)
        soup = BeautifulSoup(page.content, 'html.parser')
        img_tag = soup.find('img')
        if not (img_tag and 'src' in img_tag.attrs):
            return
        # Resolve relative and protocol-relative sources against the page.
        image_url = urljoin(uri, img_tag['src'])
        safe_title = title.translate(_UNSAFE_FILENAME_CHARS)
        download_image(image_url, f"{_IMAGE_DIR}/{safe_title}.jpg")
    except (requests.RequestException, OSError) as exc:
        # A missing image must not abort the generation of the whole page.
        print(f"Avertissement : image non récupérée pour {uri} : {exc}")


def _render_publication(pub, title, year):
    """Return the ``<li>`` HTML fragment describing one publication."""
    authors = ', '.join(pub.get('authFullName_s', []))
    uri = pub.get('uri_s', '')
    doi = pub.get('doiId_s', '')
    pubtype = _publication_type(
        pub.get('docType_s', ''),
        pub.get('conferenceTitle_s', ''),
        pub.get('journalTitle_s', ''),
        title,
    )
    doi_link = f'<a href="https://doi.org/{_esc(doi)}">, Lien DOI</a>' if doi else ''
    return f"""
            <li class="custom-publication">
                <custom-p><strong>{_esc(title)}</strong></custom-p>
                <custom-p>{_esc(authors)}</custom-p>
                <custom-p>{_esc(pubtype)}, {_esc(year)}</custom-p>
                <custom-p><a href="{_esc(uri)}">Lien vers HAL</a> {doi_link}</custom-p>
            </li>
        """


def create_html(publications):
    """Write the publications page, grouped by year, newest first.

    For every record with a ``uri_s`` value, the page at that address is
    fetched and its first ``<img>`` is saved under
    ``web/content/research/images``; failures are reported and skipped. The
    saved images are not referenced by the generated page. The page itself
    is written to ``web/content/research/publications.html``; missing output
    directories are created.

    Args:
        publications: List of dicts. The keys read are ``title_s``,
            ``authFullName_s``, ``producedDateY_i``, ``uri_s``,
            ``conferenceTitle_s``, ``journalTitle_s``, ``doiId_s`` and
            ``docType_s``; all are optional. The list is sorted in place.
    """
    # Create the directory the images (and the page) are actually written to.
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    # Sort from most recent to oldest; records without a year go last.
    publications.sort(key=lambda pub: pub.get('producedDateY_i', 0), reverse=True)

    html_header = """
    <!DOCTYPE html>
    <html lang="fr">
    <head>
        <meta charset="UTF-8">
        <title>Publications</title>
        <style>
            ul {
                list-style-type: disc;
                padding-left: 20px;
            }
            .custom-year-section {
                margin-top: 20px;
            }
            .custom-publication {
                margin-bottom: 0px;
                margin: 0;
            }
            img {
                max-width: 100px;
                height: auto;
                margin-right: 10px;
            }
            custom-p {
                margin: 0;
            }
        </style>
    </head>
    <body>
        <h1>Publications List</h1>
    """

    # Fragments are collected in a list and joined once at the end.
    parts = [html_header]
    section_open = False
    current_year = None

    for pub in publications:
        title_list = pub.get('title_s', [])
        title = title_list[0] if title_list else ''
        year = pub.get('producedDateY_i', '')
        uri = pub.get('uri_s', '')

        # Download the publication image when a page address is available.
        if uri:
            _save_publication_image(uri, title)

        # Start a new section whenever the year changes.
        if not section_open or year != current_year:
            if section_open:
                parts.append("</ul></div>")
            section_open = True
            current_year = year
            parts.append(f"<div class='custom-year-section'><h2>{_esc(year)}</h2><ul>")

        parts.append(_render_publication(pub, title, year))

    # Only close a year section if one was opened (the list may be empty).
    if section_open:
        parts.append("\n        </ul></div>")
    parts.append("\n    </body>\n    </html>\n    ")

    with open(_HTML_PATH, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

# Example usage. The guard keeps the network requests and file writes from
# running when this file is imported instead of executed as a script.
if __name__ == "__main__":
    author_id = "alexandre-meyer"
    publications = get_publications_from_hal(author_id)

    # Nothing is generated when the request failed or returned no record.
    if publications:
        create_html(publications)