# extract-data/extract_pdf_files.py

from pathlib import Path
from PyPDF2 import PdfReader
import pymupdf as pm

# Directory that contains this script; the data folders live beside it.
ROOT_PATH = Path(__file__).parent
# Folder passed to process_all_files as the input directory.
DATA_PATH = ROOT_PATH / "data"
# Folder passed to process_all_files as the output directory.
OUTPUT_PATH = ROOT_PATH / "output"
# Backward-compatible alias for the original, misspelled constant name.
OUTPUT_PAHT = OUTPUT_PATH


def pdf_is_readable(input_file):
    """Return True if at least one page of the PDF yields non-blank text.

    The file is opened with ``PdfReader`` and pages are checked in order;
    the scan stops at the first page whose extracted text contains anything
    other than whitespace.
    """
    page_texts = (pg.extract_text() for pg in PdfReader(input_file).pages)
    # ``extract_text`` may give back an empty/None value, hence the guard
    # before calling ``strip``.
    return any(bool(content and content.strip()) for content in page_texts)


# def read_pdf_file(input_file):

#     reader = PdfReader(input_file)
#     pages = reader.pages
#     print(len(pages), type(pages))
#     page0 = pages[0]
#     text = page0.extract_text()
#     print(text)


#     with open("output.txt", "w", encoding="utf-8") as file:
#         file.write(text)

def process_one_file(input_file):
    """Extract the plain text of every page of a PDF.

    Args:
        input_file: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, each page followed
        by a newline, or ``None`` when ``pdf_is_readable`` reports that the
        file has no extractable text.
    """
    if not pdf_is_readable(input_file):
        # Nothing extractable was found; the caller decides what to do.
        return None

    # Open the document as a context manager so it is closed even when
    # reading a page raises.
    with pm.open(input_file) as docs:
        return "".join(page.get_text("text") + "\n" for page in docs)


def process_all_files(input_dir, output_dir: Path):
    """Extract the text of every ``.pdf`` entry in *input_dir*.

    For each entry whose suffix is exactly ``.pdf`` the text returned by
    ``process_one_file`` is written, UTF-8 encoded, to
    ``<output_dir>/<stem> extracted.txt``.

    Args:
        input_dir: Directory (a ``Path``-like object providing ``iterdir``)
            that is scanned, non-recursively, for PDF files.
        output_dir: Directory receiving the text files; it is created,
            including missing parents, if it does not exist.

    Entries for which ``process_one_file`` returns ``None`` are skipped with
    a message instead of producing an output file. A directory without any
    PDF files is not an error.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in input_dir.iterdir():
        if pdf_path.suffix != ".pdf":
            continue

        file_text = process_one_file(pdf_path)
        if file_text is None:
            # process_one_file found no extractable text; writing None
            # would raise TypeError and leave an empty output file behind.
            print(f"Skipping {pdf_path.name}: no extractable text")
            continue

        output_file = output_dir / f"{pdf_path.stem} extracted.txt"
        output_file.write_text(file_text, encoding="utf-8")


# src = pm.open("ocr_needed_sample.pdf")
# res = pm.open()

# for page in src:
#     pix = page.get_pixmap()
#     pdfbytes = pix.pdfocr_tobytes(language="eng")
#     imgpdf = pm.open("pdf", pdfbytes)
#     res.insert_pdf(imgpdf)

# res.save("exported-document.pdf")


# Script entry: extract every PDF found in DATA_PATH into OUTPUT_PAHT.
# NOTE(review): this call also runs when the module is imported; consider
# an ``if __name__ == "__main__":`` guard if that is not intended.
process_all_files(input_dir=DATA_PATH, output_dir=OUTPUT_PAHT)


# file = files["pdf"][7]
# if not pdf_is_readable(file):
#     print("file is not readable")
# # print(pdf_is_readable("ocr_needed_sample.pdf"))
# print(file)
# # read_pdf_file(file)
# all_text = ""
# doc = pm.open(file)
# all_text = ""
# for page in doc:
#     for block in page.get_text("dict")["blocks"]:
#         print(block)
#         print()
#         print()

# # for page in doc:
# #     text = page.get_text("text")
# #     all_text += text + "\n"
# # with open("output.txt", "w", encoding="utf-8") as file:
# #     file.writelines(all_text)

# all_spans = []

# for page in doc:

#     spans = [
#     {
#         "text": span["text"],
#         "flags": span["flags"],
#         "page": page.number + 1
#     }
#     for block in page.get_text("dict")["blocks"] if block.get("")
#     for line in block["lines"]
#     for span in line["spans"]
#     ]

#     all_spans.extend(spans)

# for s in all_spans:
#     if s["flags"] > 4:
#         print(s)


# with open("output.txt", "w", encoding="utf-8") as file:
#     file.writelines(all_text)

# blocks = page.get_text("blocks")  # for larger text blocks

# texts = []
# # Extract detailed info with font
# for block in page.get_text("dict")["blocks"]:
#     for line in block.get("lines", []):
#         for span in line["spans"]:
#             text = span["text"]
#             font = span["font"]       # font name
#             size = span["size"]       # font size
#             flags = span["flags"]
#             texts.append({
#                 "text": text, "font": font, "size": size, "flags": flags
#             })

# for elem in texts:
#     print(elem)