XML Invoice Parser
XML Invoice Parser is a desktop application that automatically reads XML invoice files, extracts all relevant financial and supplier information, and stores the results in both Excel and a MySQL database. The tool is designed to support Accounts Payable teams by eliminating manual invoice processing and ensuring clean, structured data for further analysis or reporting.
Python
import pandas as pd
import os
import glob
import re
from lxml import etree
from datetime import datetime
import customtkinter as ctk
from tkinter import filedialog, messagebox
import mysql.connector
from mysql.connector import Error
# CLASS: XML PARSER
# Responsible for parsing XML invoice files and extracting relevant fields.
# Handles single file parsing and bulk folder parsing.
class XMLParser:
def __init__(self):
self.xml_list = []
# Extracts clinic code from the XML tree using regex pattern
@staticmethod
def extract_clinic_code(tree):
xml_string = etree.tostring(tree, encoding='unicode')
pattern = re.compile(r'1905\d{2}')
match = pattern.search(xml_string)
return match.group(0) if match else ''
# Parses a single XML invoice file and returns a dictionary of extracted data
def parse_file(self, file):
try:
tree = etree.parse(file)
root = tree.getroot()
except (etree.XMLSyntaxError, UnicodeDecodeError):
return None
clinic = self.extract_clinic_code(tree)
denominazioni = root.xpath('.//Anagrafica/Denominazione/text()')
supplier = denominazioni[0] if len(denominazioni) > 0 else ''
invoice_number = root.findtext('.//Numero')
# Clean and format invoice date
date_raw = root.findtext('.//Data')
invoice_date = ''
if date_raw:
date_clean = date_raw.split('+')[0]
try:
invoice_date = datetime.strptime(date_clean, '%Y-%m-%d').strftime('%Y-%m-%d')
except:
invoice_date = date_raw
currency = root.findtext('.//Divisa')
description = root.findtext('.//DettaglioLinee/Descrizione')
total_import = root.findtext('.//ImportoTotaleDocumento')
natura = root.findtext('.//DatiRiepilogo/Natura')
wht_code = root.findtext('.//TipoRitenuta')
wht_amount = root.findtext('.//ImportoRitenuta')
data = {
'Clinic': clinic,
'Supplier': supplier,
'Invoice_Number': invoice_number,
'Invoice_Date': invoice_date,
'Currency': currency,
'Description': description,
'Total_Import': total_import,
'Natura': natura,
'WHT_code': wht_code,
'WHT_amount': wht_amount
}
# Extract VAT data and add as dynamic fields
for riepilogo in root.findall(".//DatiRiepilogo"):
aliquota = riepilogo.findtext("AliquotaIVA")
imponibile = riepilogo.findtext("ImponibileImporto")
if aliquota and imponibile:
aliquota_clean = aliquota.replace(".00", "")
data[f"VAT_{aliquota_clean}"] = aliquota
data[f"VAT_{aliquota_clean}_amount"] = imponibile
return data
# Parses all XML files in a folder and returns a DataFrame with all extracted data
def parse_folder(self, folder_path):
self.xml_list.clear()
files = glob.glob(os.path.join(folder_path, '*.xml'))
for file in files:
result = self.parse_file(file)
if result:
self.xml_list.append(result)
return pd.DataFrame(self.xml_list)
[Salto de ajuste de texto]
# CLASS: MYSQL SAVER
# Responsible for creating MySQL table (if not exists) and saving DataFrame to the database
class MySQLSaver:
def __init__(self, host, user, password, database):
self.credentials = {
"host": host,
"user": user,
"password": password,
"database": database
}
# Saves the provided DataFrame into the MySQL table `invoice_data`
# Creates the table if it does not exist
def save(self, df: pd.DataFrame):
try:
connection = mysql.connector.connect(**self.credentials)
cursor = connection.cursor()
columns_sql = ", ".join([f"`{col}` TEXT" for col in df.columns])
create_query = f"""
CREATE TABLE IF NOT EXISTS invoice_data (
id INT AUTO_INCREMENT PRIMARY KEY,
{columns_sql}
)
"""
cursor.execute(create_query)
insert_sql = f"""
INSERT INTO invoice_data ({", ".join(df.columns)})
VALUES ({", ".join(['%s'] * len(df.columns))})
"""
cursor.executemany(insert_sql, df.values.tolist())
connection.commit()
cursor.close()
connection.close()
return True
except Error as e:
messagebox.showerror("MySQL Error", f"MySQL save failed:\n{e}")
return False
# CLASS: GUI APPLICATION
# Main GUI application connecting XML parser and MySQL saver
class XMLApp:
def __init__(self):
self.parser = XMLParser()
self.mysql_saver = MySQLSaver(
host="localhost",
user="root",
password="your_password",
database="invoices_db"
)
ctk.set_appearance_mode('dark')
ctk.set_default_color_theme('green')
self.root = ctk.CTk()
self.root.geometry("320x270")
self.root.title("XML Invoice Parser")
self.folder_path = None
ctk.CTkLabel(self.root, text="Select folder with XML invoices:", font=("Segoe UI", 14)).pack(pady=5)
ctk.CTkButton(self.root, text="Browse Folder", command=self.browse_folder).pack(pady=5)
self.status_label = ctk.CTkLabel(self.root, text="", font=("Arial", 12))
self.status_label.pack(pady=5)
ctk.CTkButton(self.root, text="START", font=("Segoe UI", 14), command=self.process).pack(pady=10)
self.root.mainloop()
# Opens a folder dialog to select the folder containing XML files
def browse_folder(self):
folder = filedialog.askdirectory()
if folder:
self.folder_path = folder
self.status_label.configure(text=f"Selected: {os.path.basename(folder)}")
# Processes all XML files in the selected folder:
# Parses them, saves results to Excel and MySQL, and shows messages
def process(self):
if not self.folder_path:
messagebox.showwarning("Warning", "Please select a folder first.")
return
df = self.parser.parse_folder(self.folder_path)
df.to_excel("invoices_output_app.xlsx", index=False)
mysql_ok = self.mysql_saver.save(df)
if mysql_ok:
messagebox.showinfo("Done", "Excel + MySQL export completed!")
# Instantiates the GUI application and starts event loop
XMLApp()© Marta-Kowalczyk.com