Project2 • Mind map

Wheel = zoom • Drag = pan • Double click = fit

Code (Python) • read-only


def resolve_document_result(matches: dict[str, bool], expected_count: int) -> tuple[bool, list[str]]:
    """Decide whether a document is valid and collect the reasons for the verdict.

    A document is valid when the number of matched fields equals
    ``expected_count`` exactly.

    Args:
        matches: Maps each checked field name to whether it was found.
        expected_count: Number of fields that have to match.

    Returns:
        ``(is_valid, reasons)``. Unmatched fields are always listed in
        ``reasons`` (even for a valid document), a surplus of matches is
        flagged, and an invalid result always carries at least one reason.
    """
    found = [name for name, hit in matches.items() if hit]
    missing = [name for name, hit in matches.items() if not hit]

    is_valid = len(found) == expected_count

    reasons = []
    if missing:
        reasons.append(f"Missing fields: {', '.join(missing)}")
    if len(found) > expected_count:
        reasons.append("Unexpected extra matches detected")
    if not is_valid and not reasons:
        # Every checked field matched, but fewer fields were checked than are
        # required; without this the caller gets "invalid" with no explanation.
        reasons.append(f"Expected {expected_count} matches but only {len(found)} found")

    return is_valid, reasons

Code (Python) • read-only

                
import re
import os
import csv
import pdfplumber

# Locations the script works with.
csv_path = "./data/input.csv"          # input CSV; its header row must include an "Account" column
main_folder = "./data/main_folder"     # root folder that is walked recursively for documents
output_csv = "./data/output.csv"       # result CSV; created if it does not exist, an existing file is replaced

# Start every run with a fresh output file: rows are appended later, so a
# leftover file from a previous run would otherwise be extended.
if os.path.exists(output_csv):
    os.remove(output_csv)

# Names of the CSV columns that are checked against each document; replaced
# with the real list once the input CSV header has been read.
document_keys = []

# Maps a file-name prefix (the document type, see get_expected_count) to the
# number of fields that must be found in a document of that type.
expected_variables_true = {
    "Account": 6,
    "TAC": 3,
    "Agreement": 4,
}

# Load the input CSV once. "utf-8-sig" transparently drops a leading BOM if
# the file has one.
with open(csv_path, newline="", encoding="utf-8-sig") as csv_file:
    reader = csv.DictReader(csv_file, delimiter=",")
    if not reader.fieldnames:
        raise ValueError("Input CSV has no header row.")
    # Every column except the bookkeeping ones is a field to look for in the documents.
    document_keys = [h for h in reader.fieldnames if h not in ("Account", "File", "Result")]
    # Index the rows by their stripped "Account" value; if an account appears
    # more than once, the last row wins.
    rows_by_account = {str(r["Account"]).strip(): r for r in reader}

def get_expected_count(filename: str):
    """Look up the document type and required match count for a file name.

    The first key of ``expected_variables_true`` that ``filename`` starts
    with decides the result.

    Returns:
        ``(doc_type, count)`` for the matching prefix, or ``(None, None)``
        when no known prefix matches.
    """
    candidates = (
        (prefix, required)
        for prefix, required in expected_variables_true.items()
        if filename.startswith(prefix)
    )
    return next(candidates, (None, None))

def contains_numeric_token(text: str, val: str) -> bool:
    """Return True when the digit string *val* occurs in *text* as a whole number.

    A hit only counts when it is not directly preceded or followed by another
    digit, so ``"123"`` is not found inside ``"91234"``.

    Args:
        text: Text to search.
        val: Value to look for; anything that is empty or not purely digits
            yields False.
    """
    if not val or not val.isdigit():
        return False
    # NOTE(review): the original pattern was cut off after "(?" when the file
    # was extracted; digit look-behind/look-ahead guards are the reconstruction
    # implied by the function name. Confirm against the original source.
    pattern = rf"(?<!\d){re.escape(val)}(?!\d)"
    return re.search(pattern, text) is not None


def search_in_csv(account):
    """Return the input-CSV row for *account*, or None when there is none."""
    # NOTE(review): this function was lost in extraction and is reconstructed
    # from its call site, which passes a folder name and checks the result for
    # None before calling .get() on it. Confirm against the original source.
    return rows_by_account.get(str(account).strip())


def write_and_save_to_csv(output_csv_path, account, file, row_data, fieldnames):
    """Append one result row to the output CSV, writing the header first if needed.

    Args:
        output_csv_path: Path of the CSV file to append to.
        account: Value for the "Account" column.
        file: Value for the "File" column.
        row_data: Remaining column values for the row.
        fieldnames: Column order used for the header and the row.
    """
    # NOTE(review): the signature and this line were lost in extraction (only
    # a trailing " 0" survived). Parameter names are taken from the surviving
    # body, their order from the call site. Confirm against the original source.
    file_exists = os.path.exists(output_csv_path) and os.path.getsize(output_csv_path) > 0

    with open(output_csv_path, "a", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        # Only a new (or empty) file gets a header row.
        if not file_exists:
            writer.writeheader()

        row = {"Account": account, "File": file}
        row.update(row_data)
        writer.writerow(row)


# The output columns are the same for every document, so build them once.
fieldnames = ["Account", "File"] + document_keys + ["Result"]

# Walk main_folder recursively. The name of the folder that directly contains
# a file is used as the account to look up in the input CSV.
for root, _dirs, files in os.walk(main_folder):
    current_subfolder = os.path.basename(root)

    for f in files:
        # Document type and required number of matches come from the file-name prefix.
        doc_type, expected = get_expected_count(f)
        if expected is None:
            print("Unknown document:", f)
            continue

        print("document type:", doc_type, "expected amount of matches:", expected)

        full_file_path = os.path.join(root, f)

        if expected == 0:
            print(f"Document {f} has 0 variables to check")
            continue

        variables = search_in_csv(current_subfolder)
        if variables is None:
            print(f"Account {current_subfolder} not found in CSV, skipping {f}")
            continue

        # Join the text of all pages; pages for which extract_text() returns
        # nothing are left out.
        with pdfplumber.open(full_file_path) as pdf:
            text = "\n".join(filter(None, (page.extract_text() for page in pdf.pages)))

        row_data = {}
        found_variables = 0

        for k in document_keys:
            raw = variables.get(k)
            # csv.DictReader fills the fields a short row lacks with None;
            # str(None) would turn that into the text "None" and could match
            # by accident, so a missing value is treated as empty.
            val = "" if raw is None else str(raw).strip()
            if val.isdigit():
                # Pure numbers must match as a whole token, not as part of a longer number.
                ok = contains_numeric_token(text, val)
            else:
                ok = (val != "" and val in text)
            row_data[k] = "TRUE" if ok else "FALSE"
            if ok:
                found_variables += 1

        print(f"In document found {found_variables} variable(s)")
        # The document passes only when exactly the expected number of fields matched.
        row_data["Result"] = "TRUE" if found_variables == expected else "FALSE"

        write_and_save_to_csv(output_csv, current_subfolder, full_file_path, row_data, fieldnames)