XML Invoice Parser

XML Invoice Parser is a desktop application that automatically reads XML invoice files, extracts all relevant financial and supplier information, and stores the results in both Excel and a MySQL database. The tool is designed to support Accounts Payable teams by eliminating manual invoice processing and ensuring clean, structured data for further analysis or reporting.

Python
import pandas as pd 

import os 

import glob 

import re 

from lxml import etree 

from datetime import datetime 

import customtkinter as ctk 

from tkinter import filedialog, messagebox 

import mysql.connector 

from mysql.connector import Error 

 

# CLASS: XML PARSER 

# Responsible for parsing XML invoice files and extracting relevant fields. 

# Handles single file parsing and bulk folder parsing. 

class XMLParser: 

    def __init__(self): 

        self.xml_list = [] 

 

    # Extracts clinic code from the XML tree using regex pattern 

    @staticmethod 

    def extract_clinic_code(tree): 

        xml_string = etree.tostring(tree, encoding='unicode') 

        pattern = re.compile(r'1905\d{2}') 

        match = pattern.search(xml_string) 

        return match.group(0) if match else '' 

 

    # Parses a single XML invoice file and returns a dictionary of extracted data 

    def parse_file(self, file): 

        try: 

            tree = etree.parse(file) 

            root = tree.getroot() 

        except (etree.XMLSyntaxError, UnicodeDecodeError): 

            return None 

 

        clinic = self.extract_clinic_code(tree) 

 

        denominazioni = root.xpath('.//Anagrafica/Denominazione/text()') 

        supplier = denominazioni[0] if len(denominazioni) > 0 else '' 

 

        invoice_number = root.findtext('.//Numero') 

 

        # Clean and format invoice date 

        date_raw = root.findtext('.//Data') 

        invoice_date = '' 

        if date_raw: 

            date_clean = date_raw.split('+')[0] 

            try: 

                invoice_date = datetime.strptime(date_clean, '%Y-%m-%d').strftime('%Y-%m-%d') 

            except: 

                invoice_date = date_raw 

 

        currency = root.findtext('.//Divisa') 

        description = root.findtext('.//DettaglioLinee/Descrizione') 

        total_import = root.findtext('.//ImportoTotaleDocumento') 

        natura = root.findtext('.//DatiRiepilogo/Natura') 

        wht_code = root.findtext('.//TipoRitenuta') 

        wht_amount = root.findtext('.//ImportoRitenuta') 

 

        data = { 

            'Clinic': clinic, 

            'Supplier': supplier, 

            'Invoice_Number': invoice_number, 

            'Invoice_Date': invoice_date, 

            'Currency': currency, 

            'Description': description, 

            'Total_Import': total_import, 

            'Natura': natura, 

            'WHT_code': wht_code, 

            'WHT_amount': wht_amount 

        } 

 

        # Extract VAT data and add as dynamic fields 

        for riepilogo in root.findall(".//DatiRiepilogo"): 

            aliquota = riepilogo.findtext("AliquotaIVA") 

            imponibile = riepilogo.findtext("ImponibileImporto") 

            if aliquota and imponibile: 

                aliquota_clean = aliquota.replace(".00", "") 

                data[f"VAT_{aliquota_clean}"] = aliquota 

                data[f"VAT_{aliquota_clean}_amount"] = imponibile 

 

        return data 

 

    # Parses all XML files in a folder and returns a DataFrame with all extracted data 

    def parse_folder(self, folder_path): 

        self.xml_list.clear() 

        files = glob.glob(os.path.join(folder_path, '*.xml')) 

 

        for file in files: 

            result = self.parse_file(file) 

            if result: 

                self.xml_list.append(result) 

 

        return pd.DataFrame(self.xml_list) 

[Salto de ajuste de texto] 

# CLASS: MYSQL SAVER 

# Responsible for creating MySQL table (if not exists) and saving DataFrame to the database 

class MySQLSaver: 

    def __init__(self, host, user, password, database): 

        self.credentials = { 

            "host": host, 

            "user": user, 

            "password": password, 

            "database": database 

        } 

 

    # Saves the provided DataFrame into the MySQL table `invoice_data` 

    # Creates the table if it does not exist 

    def save(self, df: pd.DataFrame): 

        try: 

            connection = mysql.connector.connect(**self.credentials) 

            cursor = connection.cursor() 

 

            columns_sql = ", ".join([f"`{col}` TEXT" for col in df.columns]) 

            create_query = f""" 

            CREATE TABLE IF NOT EXISTS invoice_data ( 

                id INT AUTO_INCREMENT PRIMARY KEY, 

                {columns_sql} 

            ) 

            """ 

            cursor.execute(create_query) 

 

            insert_sql = f""" 

            INSERT INTO invoice_data ({", ".join(df.columns)}) 

            VALUES ({", ".join(['%s'] * len(df.columns))}) 

            """ 

            cursor.executemany(insert_sql, df.values.tolist()) 

            connection.commit() 

 

            cursor.close() 

            connection.close() 

            return True 

 

        except Error as e: 

            messagebox.showerror("MySQL Error", f"MySQL save failed:\n{e}") 

            return False 

 

# CLASS: GUI APPLICATION 

# Main GUI application connecting XML parser and MySQL saver 

class XMLApp: 

    def __init__(self): 

        self.parser = XMLParser() 

        self.mysql_saver = MySQLSaver( 

            host="localhost", 

            user="root", 

            password="your_password", 

            database="invoices_db" 

        ) 

 

        ctk.set_appearance_mode('dark') 

        ctk.set_default_color_theme('green') 

 

        self.root = ctk.CTk() 

        self.root.geometry("320x270") 

        self.root.title("XML Invoice Parser") 

 

        self.folder_path = None 

 

        ctk.CTkLabel(self.root, text="Select folder with XML invoices:", font=("Segoe UI", 14)).pack(pady=5) 

        ctk.CTkButton(self.root, text="Browse Folder", command=self.browse_folder).pack(pady=5) 

 

        self.status_label = ctk.CTkLabel(self.root, text="", font=("Arial", 12)) 

        self.status_label.pack(pady=5) 

 

        ctk.CTkButton(self.root, text="START", font=("Segoe UI", 14), command=self.process).pack(pady=10) 

 

        self.root.mainloop() 

 

    # Opens a folder dialog to select the folder containing XML files 

    def browse_folder(self): 

        folder = filedialog.askdirectory() 

        if folder: 

            self.folder_path = folder 

            self.status_label.configure(text=f"Selected: {os.path.basename(folder)}") 

 

    # Processes all XML files in the selected folder: 

    # Parses them, saves results to Excel and MySQL, and shows messages 

    def process(self): 

        if not self.folder_path: 

            messagebox.showwarning("Warning", "Please select a folder first.") 

            return 

 

        df = self.parser.parse_folder(self.folder_path) 

        df.to_excel("invoices_output_app.xlsx", index=False) 

 

        mysql_ok = self.mysql_saver.save(df) 

 

        if mysql_ok: 

            messagebox.showinfo("Done", "Excel + MySQL export completed!") 

 

# Instantiates the GUI application and starts event loop 

XMLApp()
© Marta-Kowalczyk.com