From 6372fdcf6780b45e58b7ec640e20e3b5bb5668bc Mon Sep 17 00:00:00 2001
From: AZIZI ANIS p2312052 <anis.azizi@etu.univ-lyon1.fr>
Date: Tue, 10 Dec 2024 09:10:26 +0000
Subject: [PATCH] script python qui permet de convertir les donnees xml de
 l'application health vers un format csv pour faciliter l'exploitation

---
 Utils/xmltocsv.py | 154 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 Utils/xmltocsv.py

diff --git a/Utils/xmltocsv.py b/Utils/xmltocsv.py
new file mode 100644
index 0000000..a9861b6
--- /dev/null
+++ b/Utils/xmltocsv.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Simple Apple Health XML to CSV
+==============================
+:File: xmltocsv.py
+:Description: Convert Apple Health "export.xml" file into a csv
+:Version: 0.0.2
+:Created: 2019-10-04
+:Updated: 2023-10-29
+:Authors: Jason Meno (jam)
+:Dependencies: An export.xml file from Apple Health
+:License: BSD-2-Clause
+"""
+
+# %% Imports
+import os
+import pandas as pd
+import xml.etree.ElementTree as ET
+import datetime as dt
+import sys
+
+
+# %% Function Definitions
+
+def preprocess_to_temp_file(file_path):
+    """Copy the export to a cleaned temporary file and return that file's path.
+
+    Apple Health's export.xml has two problems that make it hard to parse:
+        1. The DTD markup is exported incorrectly for some data types.
+        2. It can contain the invisible vertical-tab character (U+000B).
+
+    The copy omits every line from the one containing ``<!DOCTYPE`` through
+    the one containing ``]>`` and strips vertical tabs from the lines it keeps.
+    Both files are opened as UTF-8 instead of the platform default encoding
+    (NOTE(review): assumes the export is UTF-8 encoded -- confirm).
+    """
+    print("Pre-processing and writing to temporary file...", end="")
+    sys.stdout.flush()
+    temp_file_path = "temp_preprocessed_export.xml"
+    with open(file_path, 'r', encoding='utf-8') as infile, \
+            open(temp_file_path, 'w', encoding='utf-8') as outfile:
+        skip_dtd = False
+        for line in infile:
+            if '<!DOCTYPE' in line:
+                skip_dtd = True
+            if not skip_dtd:
+                outfile.write(strip_invisible_character(line))
+            if ']>' in line:
+                skip_dtd = False
+    print("done!")
+    return temp_file_path
+
+def strip_invisible_character(line):
+    return "".join(line.split("\x0b"))  # drop every vertical tab (U+000B)
+
+
+def xml_to_csv(file_path):
+    """Parse the XML file at ``file_path`` into a DataFrame, one row per element.
+
+    Each row holds the element's own attributes. A child element whose
+    attributes are exactly ``key`` and ``value`` is also folded into its
+    parent's row, as a column named after the key.
+
+    Returns the DataFrame with the main columns first and the rows sorted by
+    ``startDate`` in descending order.
+    """
+    print("Converting XML File to CSV...", end="")
+    sys.stdout.flush()
+
+    attribute_list = []
+    depth = 0
+    for event, elem in ET.iterparse(file_path, events=('start', 'end')):
+        if event == 'start':
+            depth += 1
+            continue
+        depth -= 1
+        # Copy the attributes so the row never aliases a dict that
+        # Element.clear() may empty.
+        row = dict(elem.attrib)
+        for metadata_entry in elem:
+            entry = metadata_entry.attrib
+            if len(entry) == 2 and 'key' in entry and 'value' in entry:
+                row[entry['key']] = entry['value']
+        attribute_list.append(row)
+
+        # A child must keep its attributes until its parent's 'end' event, or
+        # the merge above finds nothing. Only elements directly under the root
+        # (and the root itself) are cleared completely; deeper elements just
+        # drop their already-merged children.
+        if depth <= 1:
+            elem.clear()
+        else:
+            del elem[:]
+
+    health_df = pd.DataFrame(attribute_list)
+
+    # Strip the long identifier prefixes for readability
+    if 'type' in health_df.columns:
+        for prefix in ('HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier'):
+            health_df['type'] = health_df['type'].str.replace(prefix, "", regex=False)
+    health_df.columns = health_df.columns.str.replace(
+        "HKCharacteristicTypeIdentifier", "", regex=False)
+
+    # Main columns first, then the Loop metadata columns that are present,
+    # then every other column in its original order (a set difference here
+    # would make the column order vary from run to run).
+    original_cols = list(health_df)
+    shifted_cols = ['type', 'sourceName', 'value', 'unit',
+                    'startDate', 'endDate', 'creationDate']
+    shifted_cols += [col for col in (
+        'com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate',
+        'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate',
+        'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes',
+    ) if col in original_cols]
+    remaining_cols = [col for col in original_cols if col not in shifted_cols]
+    health_df = health_df.reindex(columns=shifted_cols + remaining_cols)
+
+    # Sort by newest data first
+    health_df = health_df.sort_values(by='startDate', ascending=False)
+    print("done!")
+    return health_df
+
+
+def save_to_csv(health_df):
+    """Write ``health_df``, without its index, to a CSV file in the working
+    directory named ``apple_health_export_<today as YYYY-MM-DD>.csv``."""
+    print("Saving CSV file...", end="")
+    sys.stdout.flush()
+    date_stamp = dt.datetime.now().strftime('%Y-%m-%d')
+    csv_name = "apple_health_export_" + date_stamp + ".csv"
+    health_df.to_csv(csv_name, index=False)
+    print("done!")
+
+def remove_temp_file(temp_file_path):
+    """Delete the file at ``temp_file_path`` with ``os.remove``, printing a
+    progress message before and after; errors from the removal propagate."""
+    print("Removing temporary file...", end="")
+    os.remove(temp_file_path)
+    print("done!")
+
+def main():
+    """Convert ``apple_health_export/export.xml`` to a dated CSV file."""
+    temp_file_path = preprocess_to_temp_file("apple_health_export/export.xml")
+    try:
+        save_to_csv(xml_to_csv(temp_file_path))
+    finally:
+        # Remove the temporary copy even when conversion or saving fails.
+        remove_temp_file(temp_file_path)
+
+
+# %% Script entry point: run the conversion only when executed directly
+if __name__ == '__main__':
+    main()
\ No newline at end of file
-- 
GitLab