# Wheel = zoom • Drag = pan • Double click = fit  (stray image-viewer caption; not part of the script)
def resolve_document_result(matches: dict[str, bool], expected_count: int) -> tuple[bool, list[str]]:
    """Decide whether a document is valid and explain any problem found.

    Args:
        matches: Maps each checked field name to whether it was found.
        expected_count: Number of fields that must be found for the
            document to count as valid.

    Returns:
        ``(is_valid, reasons)``. ``is_valid`` is true only when the number
        of found fields equals ``expected_count``. ``reasons`` lists the
        problems detected; a "Missing fields" entry is added whenever any
        field is unmatched, even if the count still equals
        ``expected_count``.
    """
    found = [name for name, hit in matches.items() if hit]
    missing = [name for name, hit in matches.items() if not hit]
    is_valid = len(found) == expected_count

    reasons: list[str] = []
    if missing:
        reasons.append(f"Missing fields: {', '.join(missing)}")
    if len(found) > expected_count:
        reasons.append("Unexpected extra matches detected")
    if not is_valid and not reasons:
        # Fewer matches than required although no checked field is marked
        # missing (fewer fields were checked than expected). Without this
        # the caller would get an invalid result with no explanation.
        reasons.append(f"Expected {expected_count} matches, found {len(found)}")
    return is_valid, reasons
import re
import os
import csv
import pdfplumber
# Input CSV; its header names must match exactly what the script looks up.
csv_path = "./data/input.csv"
# Root folder that is walked for sub-folders and their documents.
main_folder = "./data/main_folder"
# Result CSV; created when absent, and a file left from a previous run is replaced.
output_csv = "./data/output.csv"

# Number of matches a document needs to pass, keyed by the prefix of its file name.
expected_variables_true = {"Account": 6, "TAC": 3, "Agreement": 4}

# Filled from the input CSV header when the input file is read.
document_keys = []

# Start every run from a clean output file.
if os.path.exists(output_csv):
    os.remove(output_csv)
# Load the input CSV once: record which columns hold per-document values
# and index every row by its (stripped) account.
with open(csv_path, newline="", encoding="utf-8-sig") as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",")
    header = reader.fieldnames
    if not header:
        raise ValueError("Input CSV has no header row.")

    # Every column other than these three is a value to look for in a document.
    reserved_columns = ("Account", "File", "Result")
    document_keys = [column for column in header if column not in reserved_columns]

    # A later row replaces an earlier one that has the same account.
    rows_by_account = {}
    for record in reader:
        rows_by_account[str(record["Account"]).strip()] = record
def get_expected_count(filename: str):
    """Look up the document type and required match count for a file name.

    The first entry of ``expected_variables_true`` whose key is a prefix of
    ``filename`` wins.

    Returns:
        ``(doc_type, count)`` for the matching entry, or ``(None, None)``
        when no key is a prefix of ``filename``.
    """
    candidates = (
        (prefix, required)
        for prefix, required in expected_variables_true.items()
        if filename.startswith(prefix)
    )
    return next(candidates, (None, None))
def contains_numeric_token(text: str, val: str) -> bool:
    """Report whether ``val`` occurs in ``text`` as a stand-alone number.

    Args:
        text: Text to search.
        val: Digit string to look for.

    Returns:
        ``False`` when ``val`` is empty or not all digits; otherwise whether
        ``val`` appears in ``text`` without a digit directly before or after.
    """
    if not val or not val.isdigit():
        return False
    # NOTE(review): the original pattern was cut off after "(?"; the digit
    # lookarounds below are a reconstruction - confirm against the original.
    pattern = rf"(?<!\d){re.escape(val)}(?!\d)"
    return re.search(pattern, text) is not None


def search_in_csv(account):
    """Return the input-CSV row stored for ``account``, or ``None``.

    NOTE(review): this function was lost in the corrupted region. It is
    rebuilt from its call site, which passes a folder name and treats
    ``None`` as "account not in CSV", and from ``rows_by_account``, whose
    keys are stripped account strings - confirm against the original.
    """
    return rows_by_account.get(str(account).strip())


def write_and_save_to_csv(output_csv_path, account, file, row_data, fieldnames):
    """Append one result row to the output CSV, writing the header if needed.

    NOTE(review): the signature and the ``file_exists`` line were lost in the
    corrupted region and are rebuilt from the call site and from the names
    the surviving body uses - confirm against the original.

    Args:
        output_csv_path: CSV file to append to; created if it does not exist.
        account: Value for the "Account" column.
        file: Value for the "File" column.
        row_data: Remaining column values, merged into the row.
        fieldnames: Column order for the writer.
    """
    # A missing or empty file still needs its header row.
    file_exists = os.path.exists(output_csv_path) and os.path.getsize(output_csv_path) > 0
    with open(output_csv_path, "a", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        row = {"Account": account, "File": file}
        row.update(row_data)
        writer.writerow(row)
# Walk main_folder, check every recognised document against the CSV row of
# the account named after its containing folder, and append one result row
# per checked file to the output CSV.

# Column order is the same for every row, so build it once.
fieldnames = ["Account", "File"] + document_keys + ["Result"]

for root, _dirs, files in os.walk(main_folder):
    # The name of the folder holding the file is used as the account id.
    current_subfolder = os.path.basename(root)
    for f in files:
        doc_type, expected = get_expected_count(f)
        if expected is None:
            print("Unknown document:", f)
            continue
        print("document type:", doc_type, "expected amount of matches:", expected)
        full_file_path = os.path.join(root, f)
        if expected == 0:
            print(f"Document {f} has 0 variables to check")
            continue

        variables = search_in_csv(current_subfolder)
        if variables is None:
            print(f"Account {current_subfolder} not found in CSV, skipping {f}")
            continue

        with pdfplumber.open(full_file_path) as pdf:
            # filter(None, ...) drops pages whose extracted text is falsy.
            text = "\n".join(filter(None, (page.extract_text() for page in pdf.pages)))

        row_data = {}
        found_variables = 0
        for k in document_keys:
            # csv.DictReader stores None for cells missing from a short row;
            # treat that as empty instead of searching for the text "None".
            raw = variables.get(k)
            val = "" if raw is None else str(raw).strip()
            if val.isdigit():
                # Pure digit values must match as whole numbers.
                ok = contains_numeric_token(text, val)
            else:
                ok = val != "" and val in text
            row_data[k] = "TRUE" if ok else "FALSE"
            if ok:
                found_variables += 1

        print(f"In document found {found_variables} variable(s)")
        row_data["Result"] = "TRUE" if found_variables == expected else "FALSE"
        write_and_save_to_csv(output_csv, current_subfolder, full_file_path, row_data, fieldnames)