Potential Duplicates Check
The Potential Duplicate Invoices tool is designed to help Accounts Payable teams identify and flag invoices that may have been recorded more than once. By analyzing invoice numbers, dates, supplier names, legal entities, and amounts, the tool highlights potential duplicates, reducing errors and preventing unnecessary payments. This solution was implemented as a Business Control for a client.
Python
from datetime import datetime
import pandas as pd
from thefuzz import fuzz, process
import glob
import os
import re
# Location of the source workbooks (anonymized).
folder_path = "xxxxxxxxxxxxxxxxxx"
keyword = "Supplier Invoice Details"

# Searching for all Excel files that contain the keyword in their name.
file_paths = glob.glob(os.path.join(folder_path, f"*{keyword}*.xlsx"))

dataframes = []
# Loading each Excel file into a pandas DataFrame; files that cannot be read
# are reported and skipped rather than aborting the whole run.
for file_path in file_paths:
    try:
        df = pd.read_excel(file_path, header=0)
        dataframes.append(df)
        print(f'Loaded file: {file_path}')
    except Exception as e:
        print(f'Error loading file {file_path}: {e}')

# Fail early with a clear message when nothing was loaded:
# pd.concat([]) would otherwise raise an opaque ValueError.
if not dataframes:
    raise FileNotFoundError(
        f"No loadable '*{keyword}*.xlsx' files found in {folder_path!r}"
    )

# Combining all DataFrames into one.
combined_df = pd.concat(dataframes, ignore_index=True)

# Dropping rows where 'Invoice Amount' is 0.0 (a zero amount means the
# invoice was cancelled in the ERP).
combined_df = combined_df[combined_df['Invoice Amount'] != 0.0]
# FUNCTIONS
# Prefiltering the data for Incorrect Supplier Check
def pre_filter_supplier(combined_df):
    """Keep only rows that share date, legal entity, amount and business unit
    with at least one other row (supplier name intentionally ignored)."""
    key_columns = ['Invoice Date', 'Legal Entity Name', 'Invoice Amount', 'Business Unit Name']
    dup_mask = combined_df.duplicated(subset=key_columns, keep=False)
    return combined_df[dup_mask]
# Prefiltering the data for Incorrect Legal Entity Check
def pre_filter_le(combined_df):
    """Keep only rows that share supplier, date, amount and business unit
    with at least one other row (legal entity intentionally ignored)."""
    key_columns = ['Supplier or Party Name', 'Invoice Date', 'Invoice Amount', 'Business Unit Name']
    dup_mask = combined_df.duplicated(subset=key_columns, keep=False)
    return combined_df[dup_mask]
# Normalizing the invoice number: converting to uppercase and removing special characters
def normalize_invoice_number(number):
    """Canonicalize an invoice number for fuzzy comparison: coerce to text,
    drop runs of non-word characters, and upper-case the result."""
    as_text = str(number)
    stripped = re.sub(r'\W+', '', as_text)
    return stripped.upper()
# Function that compares 2 invoice numbers. Threshold may be adjusted (95 = 95% of similarity)
def is_similar(s1, s2, threshold=95):
    """Return True when two invoice numbers are at least `threshold` percent
    similar after normalization (95 = 95% similarity; adjust as needed)."""
    left = normalize_invoice_number(s1)
    right = normalize_invoice_number(s2)
    similarity = fuzz.ratio(left, right)
    return similarity >= threshold
# Detects potential duplicate invoices across different suppliers by comparing similar invoice numbers within groups defined by invoice date, legal entity, invoice amount, and business unit. Ignoring Supplier name - This helps identify potential duplicate or suspicious invoices that share the same key attributes but were posted as from different vendors.
def find_potential_duplicates_supplier(combined_df):
    """Flag pairs of invoices that share date, legal entity, amount and business
    unit but were posted under DIFFERENT suppliers with near-identical invoice
    numbers. Returns a DataFrame with one row per suspicious pair; columns are
    guaranteed even when nothing is flagged.
    """
    result_columns = ['Invoice Number 1', 'Invoice Number 2',
                      'Supplier 1', 'Supplier 2', 'Business Unit Name']
    # Work on a copy: the input is typically a filtered slice, and assigning
    # into it would mutate the caller's data and raise SettingWithCopyWarning.
    combined_df = combined_df.copy()
    combined_df['Invoice Number'] = combined_df['Invoice Number'].astype(str)
    grouped = combined_df.groupby(['Invoice Date', 'Legal Entity Name',
                                   'Invoice Amount', 'Business Unit Name'])
    duplicates = []
    for _, group in grouped:
        invoice_numbers = group['Invoice Number'].tolist()
        business_unit_names = group['Business Unit Name'].tolist()
        supplier_names = group['Supplier or Party Name'].tolist()
        # Pairwise comparison inside each group; groups are small, so the
        # quadratic scan is acceptable.
        for i in range(len(invoice_numbers)):
            for j in range(i + 1, len(invoice_numbers)):
                if supplier_names[i] != supplier_names[j] and is_similar(invoice_numbers[i], invoice_numbers[j]):
                    duplicates.append({
                        'Invoice Number 1': invoice_numbers[i],
                        'Invoice Number 2': invoice_numbers[j],
                        'Supplier 1': supplier_names[i],
                        'Supplier 2': supplier_names[j],
                        'Business Unit Name': business_unit_names[i],
                    })
    # Explicit columns so downstream column access works on an empty result.
    return pd.DataFrame(duplicates, columns=result_columns)
# Detects potential duplicate invoices across different suppliers by comparing similar invoice numbers within groups defined by invoice date, supplier name, invoice amount, and business unit. Ignoring Legal Entity -This helps identify potential duplicate or suspicious invoices that share the same key attributes but were posted on different legal entities.
def find_potential_duplicates_le(combined_df):
    """Flag pairs of invoices that share supplier, date, amount and business
    unit but have near-identical invoice numbers — candidates for the same
    invoice posted on different legal entities. Returns a DataFrame with one
    row per suspicious pair; columns are guaranteed even when nothing is flagged.
    """
    result_columns = ['Invoice Number 1', 'Invoice Number 2',
                      'Supplier 1', 'Supplier 2', 'Business Unit Name']
    # Work on a copy: the input is typically a filtered slice, and assigning
    # into it would mutate the caller's data and raise SettingWithCopyWarning.
    combined_df = combined_df.copy()
    combined_df['Invoice Number'] = combined_df['Invoice Number'].astype(str)
    grouped = combined_df.groupby(['Supplier or Party Name', 'Invoice Date',
                                   'Invoice Amount', 'Business Unit Name'])
    duplicates = []
    for _, group in grouped:
        invoice_numbers = group['Invoice Number'].tolist()
        business_unit_names = group['Business Unit Name'].tolist()
        supplier_names = group['Supplier or Party Name'].tolist()
        # Pairwise comparison inside each group; groups are small, so the
        # quadratic scan is acceptable.
        for i in range(len(invoice_numbers)):
            for j in range(i + 1, len(invoice_numbers)):
                if is_similar(invoice_numbers[i], invoice_numbers[j]):
                    duplicates.append({
                        'Invoice Number 1': invoice_numbers[i],
                        'Invoice Number 2': invoice_numbers[j],
                        'Supplier 1': supplier_names[i],
                        'Supplier 2': supplier_names[j],
                        'Business Unit Name': business_unit_names[i],
                    })
    # Explicit columns so downstream column access works on an empty result.
    return pd.DataFrame(duplicates, columns=result_columns)
# Run both checks on pre-filtered data (pre-filtering shrinks the input to
# rows that already share the exact group keys, so the fuzzy pass stays fast).
filtered_df_supplier = pre_filter_supplier(combined_df)
potential_duplicates_supplier = find_potential_duplicates_supplier(filtered_df_supplier)
filtered_df_le = pre_filter_le(combined_df)
potential_duplicates_le = find_potential_duplicates_le(filtered_df_le)
# Function that saves the results to a history file to avoid detecting the same duplicates in future runs
def save_new_and_history(potential_duplicates, sheet_name, new_file_path, history_file_path):
    """Write this run's genuinely new duplicate pairs to `new_file_path` and
    append them to the per-check sheet of the history workbook, so the same
    pair is never reported twice.

    Parameters:
        potential_duplicates: DataFrame of candidate pairs (not mutated).
        sheet_name: history sheet dedicated to this check.
        new_file_path: Excel file that receives only this run's new findings.
        history_file_path: Excel workbook accumulating all pairs ever found.

    Returns the DataFrame of new (previously unreported) pairs.
    """
    # Copy first so the caller's DataFrame is not mutated by the new column.
    potential_duplicates = potential_duplicates.copy()
    potential_duplicates['Found Date'] = datetime.today().strftime('%Y-%m-%d')

    # Load previously reported pairs for this check; start fresh on the first
    # run or when the sheet/workbook cannot be read. `Exception` (not a bare
    # except) so Ctrl-C / SystemExit still propagate.
    history_df = pd.DataFrame()
    if os.path.exists(history_file_path):
        try:
            history_df = pd.read_excel(history_file_path, sheet_name=sheet_name)
        except Exception:
            history_df = pd.DataFrame()

    if not history_df.empty:
        # A pair is "new" when its (Invoice Number 1, Invoice Number 2)
        # combination has never been reported in the history sheet.
        seen_pairs = history_df[['Invoice Number 1', 'Invoice Number 2']].apply(tuple, axis=1)
        is_new = ~potential_duplicates[['Invoice Number 1', 'Invoice Number 2']].apply(tuple, axis=1).isin(seen_pairs)
        new_duplicates = potential_duplicates[is_new]
        updated_history = pd.concat([history_df, new_duplicates], ignore_index=True)
    else:
        new_duplicates = potential_duplicates
        updated_history = potential_duplicates

    # Append-or-create: when the workbook already exists, replace only this
    # check's sheet; otherwise create the workbook from scratch.
    if os.path.exists(history_file_path):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(history_file_path, engine='openpyxl', **writer_kwargs) as writer:
        updated_history.to_excel(writer, sheet_name=sheet_name, index=False)

    # This run's new findings go to their own file for review.
    new_duplicates.to_excel(new_file_path, index=False)
    return new_duplicates
# Execution of Check 1: same supplier/date/amount/BU, legal entity ignored.
new_LE = save_new_and_history(
    potential_duplicates=potential_duplicates_le,
    sheet_name="ignoring LE",
    new_file_path="potential_duplicates_no_LE.xlsx",
    history_file_path="potential_duplicates_history.xlsx"
)
# Execution of Check 2: same date/LE/amount/BU, supplier name ignored.
# Both checks share one history workbook, each on its own sheet.
new_supplier = save_new_and_history(
    potential_duplicates=potential_duplicates_supplier,
    sheet_name="ignoring supplier name",
    new_file_path="potential_duplicates_no_supplier.xlsx",
    history_file_path="potential_duplicates_history.xlsx"
)
# LOG: count of previously-unseen pairs found in this run, per check.
print(f"ignoring LE: {len(new_LE)} new records")
print(f"ignoring supplier name: {len(new_supplier)} new records")
© Marta-Kowalczyk.com