# extract_html.py
# Fetch an author's publication list from the HAL open-archive API and
# render it as a static HTML page.
import html
import os

import requests
from bs4 import BeautifulSoup

# URL normalisation.
def ensure_url_scheme(url):
    """Return *url* with an explicit scheme, defaulting to HTTPS.

    A URL that already starts with ``http://`` or ``https://`` is returned
    unchanged. Anything else has its leading slashes stripped (which covers
    protocol-relative ``//host/path`` values) and ``https://`` prepended.
    """
    already_qualified = url.startswith('http://') or url.startswith('https://')
    if already_qualified:
        return url
    bare = url.lstrip('/')
    return 'https://' + bare

# HAL API access.
def get_publications_from_hal(author_id, rows=50):
    """Fetch the publication records of one author from the HAL search API.

    Args:
        author_id: HAL author identifier, matched against ``authIdHal_s``.
        rows: Maximum number of records requested in the single API call.

    Returns:
        The list stored under ``response.docs`` in the JSON answer, or
        ``None`` when the request fails or the API answers with a status
        code other than 200.
    """
    # Fields requested for every record (the ``fl`` query parameter).
    fields = (
        "title_s,authFullName_s,producedDateY_i,uri_s,conferenceTitle_s,"
        "journalTitle_s,doiId_s,docType_s,software_s"
    )
    url = (
        f"https://api.archives-ouvertes.fr/search/?q=authIdHal_s:{author_id}"
        f"&wt=json&fl={fields}&start=0&rows={rows}"
    )

    # A timeout keeps the script from hanging forever on a stalled connection.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Erreur : Impossible de contacter l'API de HAL : {exc}")
        return None

    if response.status_code != 200:
        print(f"Erreur : Impossible de récupérer les données de l'API de HAL. Code de statut : {response.status_code}")
        return None

    return response.json()['response']['docs']

def download_image(image_url, image_path):
    """Download *image_url* and store its bytes at *image_path*.

    The URL is first passed through ``ensure_url_scheme`` so scheme-less
    values can be fetched. The parent directory of *image_path* is created
    when it does not exist yet.

    Args:
        image_url: Address of the image, with or without a scheme.
        image_path: Destination file path.

    Returns:
        ``True`` when the image was fetched with status 200 and written,
        ``False`` when the request failed or returned another status.
    """
    image_url = ensure_url_scheme(image_url)

    # Network failures are reported the same way as a non-200 answer.
    try:
        response = requests.get(image_url, timeout=30)
    except requests.RequestException:
        return False
    if response.status_code != 200:
        return False

    target_dir = os.path.dirname(image_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(image_path, 'wb') as f:
        f.write(response.content)
    return True

# Output locations, relative to the current working directory.
_OUTPUT_DIR = 'web/content/research'
_IMAGE_DIR = f"{_OUTPUT_DIR}/images"
_OUTPUT_FILE = f"{_OUTPUT_DIR}/publications.html"

# Seconds to wait for a publication page before giving up on its image.
_PAGE_TIMEOUT = 30

# Characters replaced by "_" when a title is turned into a file name.
_FILENAME_TRANSLATION = str.maketrans({char: '_' for char in ' /\\:*?"<>|'})

# Display label for the HAL document types that are handled explicitly.
_DOC_TYPE_LABELS = {
    'HDR': "HDR",
    'REPORT': "Research Report",
    'THESE': "Thesis",
    'UNDEFINED': "Pre-publication",
    'COUV': "Book chapter",
    'SOFTWARE': "Software",
}

_HTML_HEADER = """
    <!DOCTYPE html>
    <html lang="fr">
    <head>
        <meta charset="UTF-8">
        <title>Publications</title>
        <style>
            ul {
                list-style-type: disc;
                padding-left: 20px;
            }
            .custom-year-section {
                margin-top: 20px;
            }
            .custom-publication {
                margin-bottom: 0px;
                margin: 0;
            }
            img {
                max-width: 100px;
                height: auto;
                margin-right: 10px;
            }
            custom-p {
                margin: 0;
            }
        </style>
    </head>
    <body>
        <h1>Publications List</h1>
    """

_HTML_FOOTER = """
    </body>
    </html>
    """


def _esc(value):
    """Return *value* as text that is safe inside HTML content or attributes."""
    return html.escape(str(value))


def _publication_type(doc_type, conference, journal, title):
    """Return the label shown for a publication's type or venue."""
    label = _DOC_TYPE_LABELS.get(doc_type)
    if label is not None:
        if doc_type == 'UNDEFINED':
            # Trace records whose type HAL could not determine.
            print(doc_type + " title=" + title)
        return label
    if conference or journal:
        return conference if conference else journal
    # Trace document types that have no dedicated label yet.
    print(doc_type + " title=" + title)
    return "Other"


def _save_publication_image(uri, title):
    """Best-effort download of the first <img> found on the page at *uri*.

    Returns the path the image is saved under, or ``None`` when the page has
    no usable image or cannot be fetched. Failures only produce a warning so
    that one unreachable page does not abort the whole HTML generation.
    """
    try:
        page = requests.get(uri, timeout=_PAGE_TIMEOUT)
        img_tag = BeautifulSoup(page.content, 'html.parser').find('img')
        if img_tag is None or 'src' not in img_tag.attrs:
            return None
        safe_title = title.translate(_FILENAME_TRANSLATION)
        image_path = f"{_IMAGE_DIR}/{safe_title}.jpg"
        download_image(img_tag['src'], image_path)
    except (requests.RequestException, OSError) as exc:
        print(f"Avertissement : image non récupérée pour {uri} : {exc}")
        return None
    return image_path


def _render_publication(pub, title, year, uri):
    """Return the ``<li>`` markup describing one publication record."""
    authors = ', '.join(pub.get('authFullName_s', []))
    doi = pub.get('doiId_s', '')
    pubtype = _publication_type(
        pub.get('docType_s', ''),
        pub.get('conferenceTitle_s', ''),
        pub.get('journalTitle_s', ''),
        title,
    )
    doi_link = f'<a href="https://doi.org/{_esc(doi)}">, Lien DOI</a>' if doi else ''
    return f"""
            <li class="custom-publication">
                <custom-p><strong>{_esc(title)}</strong></custom-p>
                <custom-p>{_esc(authors)}</custom-p>
                <custom-p>{_esc(pubtype)}, {_esc(year)}</custom-p>
                <custom-p><a href="{_esc(uri)}">Lien vers HAL</a> {doi_link}</custom-p>
            </li>
        """


def create_html(publications):
    """Write the publication list as an HTML page grouped by year.

    Publications are ordered from most recent to oldest (records without a
    year come last) and rendered under one heading per year. For every record
    that has a ``uri_s``, the first image of that page is downloaded on a
    best-effort basis. All text coming from the records is HTML-escaped.

    Args:
        publications: List of record dicts as returned by the HAL API. The
            list itself is not modified.

    Side effects:
        Creates ``web/content/research/images`` if needed and writes
        ``web/content/research/publications.html`` (UTF-8).
    """
    # Creates the page directory as well, since it is the parent of images/.
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    ordered = sorted(
        publications,
        key=lambda pub: pub.get('producedDateY_i', 0),
        reverse=True,
    )

    parts = [_HTML_HEADER]
    current_year = None

    for pub in ordered:
        titles = pub.get('title_s', [])
        title = titles[0] if titles else ''
        year = pub.get('producedDateY_i', '')
        uri = pub.get('uri_s', '')

        if uri:
            _save_publication_image(uri, title)

        # Open a new year section whenever the year changes.
        if year != current_year:
            if current_year is not None:
                parts.append("</ul></div>")
            current_year = year
            parts.append(f"<div class='custom-year-section'><h2>{_esc(year)}</h2><ul>")

        parts.append(_render_publication(pub, title, year, uri))

    # Only close a section if one was opened (the list may be empty).
    if current_year is not None:
        parts.append("\n        </ul></div>")
    parts.append(_HTML_FOOTER)

    with open(_OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

# Example usage: build the publication page for one HAL author.
author_id = "alexandre-meyer"
publications = get_publications_from_hal(author_id)

# Nothing is written when the fetch failed (None) or returned no record.
if publications:
    create_html(publications)