Skip to content
Snippets Groups Projects
Commit 6372fdcf authored by AZIZI ANIS p2312052's avatar AZIZI ANIS p2312052
Browse files

script python qui permet de convertir les donnees xml de l'application health...

script python qui permet de convertir les donnees xml de l'application health vers un format csv pour faciliter l'exploitation
parent 8b63ebe5
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simple Apple Health XML to CSV
==============================
:File: convert.py
:Description: Convert Apple Health "export.xml" file into a csv
:Version: 0.0.2
:Created: 2019-10-04
:Updated: 2023-10-29
:Authors: Jason Meno (jam)
:Dependencies: An export.xml file from Apple Health
:License: BSD-2-Clause
"""
# %% Imports
import os
import pandas as pd
import xml.etree.ElementTree as ET
import datetime as dt
import sys
# %% Function Definitions
def preprocess_to_temp_file(file_path):
"""
The export.xml file is where all your data is, but Apple Health Export has
two main problems that make it difficult to parse:
1. The DTD markup syntax is exported incorrectly by Apple Health for some data types.
2. The invisible character \x0b (sometimes rendered as U+000b) likes to destroy trees. Think of the trees!
Knowing this, we can save the trees and pre-processes the XML data to avoid destruction and ParseErrors.
"""
print("Pre-processing and writing to temporary file...", end="")
sys.stdout.flush()
temp_file_path = "temp_preprocessed_export.xml"
with open(file_path, 'r') as infile, open(temp_file_path, 'w') as outfile:
skip_dtd = False
for line in infile:
if '<!DOCTYPE' in line:
skip_dtd = True
if not skip_dtd:
line = strip_invisible_character(line)
outfile.write(line)
if ']>' in line:
skip_dtd = False
print("done!")
return temp_file_path
def strip_invisible_character(line):
return line.replace("\x0b", "")
def xml_to_csv(file_path):
"""Loops through the element tree, retrieving all objects, and then
combining them together into a dataframe
"""
print("Converting XML File to CSV...", end="")
sys.stdout.flush()
attribute_list = []
for event, elem in ET.iterparse(file_path, events=('end',)):
if event == 'end':
child_attrib = elem.attrib
for metadata_entry in list(elem):
metadata_values = list(metadata_entry.attrib.values())
if len(metadata_values) == 2:
metadata_dict = {metadata_values[0]: metadata_values[1]}
child_attrib.update(metadata_dict)
attribute_list.append(child_attrib)
# Clear the element from memory to avoid excessive memory consumption
elem.clear()
health_df = pd.DataFrame(attribute_list)
# Every health data type and some columns have a long identifer
# Removing these for readability
health_df.type = health_df.type.str.replace('HKQuantityTypeIdentifier', "")
health_df.type = health_df.type.str.replace('HKCategoryTypeIdentifier', "")
health_df.columns = \
health_df.columns.str.replace("HKCharacteristicTypeIdentifier", "")
# Reorder some of the columns for easier visual data review
original_cols = list(health_df)
shifted_cols = ['type',
'sourceName',
'value',
'unit',
'startDate',
'endDate',
'creationDate']
# Add loop specific column ordering if metadata entries exist
if 'com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate' in original_cols:
shifted_cols.append(
'com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate')
if 'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate' in original_cols:
shifted_cols.append(
'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate')
if 'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes' in original_cols:
shifted_cols.append(
'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes')
remaining_cols = list(set(original_cols) - set(shifted_cols))
reordered_cols = shifted_cols + remaining_cols
health_df = health_df.reindex(labels=reordered_cols, axis='columns')
# Sort by newest data first
health_df.sort_values(by='startDate', ascending=False, inplace=True)
print("done!")
return health_df
def save_to_csv(health_df):
print("Saving CSV file...", end="")
sys.stdout.flush()
today = dt.datetime.now().strftime('%Y-%m-%d')
health_df.to_csv("apple_health_export_" + today + ".csv", index=False)
print("done!")
return
def remove_temp_file(temp_file_path):
print("Removing temporary file...", end="")
os.remove(temp_file_path)
print("done!")
return
def main():
file_path = "apple_health_export/export.xml"
temp_file_path = preprocess_to_temp_file(file_path)
health_df = xml_to_csv(temp_file_path)
save_to_csv(health_df)
remove_temp_file(temp_file_path)
return
# %%
if __name__ == '__main__':
main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment