extract-data/extract_pdf_files.py

156 lines
3.4 KiB
Python

from pathlib import Path
from PyPDF2 import PdfReader
import pymupdf as pm
# Directory layout, resolved relative to this script's location.
ROOT_PATH = Path(__file__).parent
# Folder scanned for input PDF files.
DATA_PATH = ROOT_PATH / "data"
# Folder that receives the extracted ``.txt`` files.
OUTPUT_PATH = ROOT_PATH / "output"
# Misspelled legacy name, kept as an alias so existing references still work.
OUTPUT_PAHT = OUTPUT_PATH
def pdf_is_readable(input_file):
    """Report whether a PDF contains any extractable text.

    Opens ``input_file`` with ``PdfReader`` and checks its pages in order,
    stopping at the first page whose extracted text is not empty or
    whitespace-only.

    Returns:
        ``True`` if at least one page yields non-blank text, ``False``
        otherwise (including a document with no pages).
    """
    pages = PdfReader(input_file).pages
    # ``extract_text()`` may give back an empty/falsy value; normalise it to
    # "" so the strip() check is safe.
    return any((page.extract_text() or "").strip() for page in pages)
# def read_pdf_file(input_file):
# reader = PdfReader(input_file)
# pages = reader.pages
# print(len(pages), type(pages))
# page0 = pages[0]
# text = page0.extract_text()
# print(text)
# with open("output.txt", "w", encoding="utf-8") as file:
# file.write(text)
def process_one_file(input_file):
    """Extract the plain text of every page of a PDF.

    Args:
        input_file: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, each page followed
        by a newline, or ``None`` when ``pdf_is_readable`` finds no
        extractable text in the file.
    """
    if not pdf_is_readable(input_file):
        return None
    # Open the document as a context manager so it is closed even if a page
    # fails to extract; previously the handle was never released.
    with pm.open(input_file) as docs:
        # Join once instead of growing a string with repeated ``+=``.
        return "".join(page.get_text("text") + "\n" for page in docs)
def process_all_files(input_dir: Path, output_dir: Path):
    """Extract text from every PDF directly inside ``input_dir``.

    For each PDF a file named ``"<stem> extracted.txt"`` (UTF-8) is written
    to ``output_dir``, which is created if it does not exist.

    Args:
        input_dir: Directory to scan (not recursive) for ``.pdf`` files.
            The suffix is matched case-insensitively and only regular files
            are considered.
        output_dir: Directory that receives the extracted text files.

    Returns:
        None. A directory without any PDF is a no-op rather than a
        ``KeyError``, and PDFs for which ``process_one_file`` returns
        ``None`` (no extractable text) are skipped instead of crashing on
        ``write(None)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing order is deterministic across platforms.
    pdf_files = sorted(
        entry
        for entry in input_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )

    for pdf_file in pdf_files:
        file_text = process_one_file(pdf_file)
        if file_text is None:
            # Nothing to write for a PDF without a text layer.
            continue
        output_file = output_dir / f"{pdf_file.stem} extracted.txt"
        output_file.write_text(file_text, encoding="utf-8")
# src = pm.open("ocr_needed_sample.pdf")
# res = pm.open()
# for page in src:
# pix = page.get_pixmap()
# pdfbytes = pix.pdfocr_tobytes(language="eng")
# imgpdf = pm.open("pdf", pdfbytes)
# res.insert_pdf(imgpdf)
# res.save("exported-document.pdf")
# Runs whenever this module is executed or imported: extract every PDF found
# in DATA_PATH into text files under OUTPUT_PAHT.
process_all_files(input_dir=DATA_PATH, output_dir=OUTPUT_PAHT)
# file = files["pdf"][7]
# if not pdf_is_readable(file):
# print("file is not readable")
# # print(pdf_is_readable("ocr_needed_sample.pdf"))
# print(file)
# # read_pdf_file(file)
# all_text = ""
# doc = pm.open(file)
# all_text = ""
# for page in doc:
# for block in page.get_text("dict")["blocks"]:
# print(block)
# print()
# print()
# # for page in doc:
# # text = page.get_text("text")
# # all_text += text + "\n"
# # with open("output.txt", "w", encoding="utf-8") as file:
# # file.writelines(all_text)
# all_spans = []
# for page in doc:
# spans = [
# {
# "text": span["text"],
# "flags": span["flags"],
# "page": page.number + 1
# }
# for block in page.get_text("dict")["blocks"] if block.get("")
# for line in block["lines"]
# for span in line["spans"]
# ]
# all_spans.extend(spans)
# for s in all_spans:
# if s["flags"] > 4:
# print(s)
# with open("output.txt", "w", encoding="utf-8") as file:
# file.writelines(all_text)
# blocks = page.get_text("blocks") # for larger text blocks
# texts = []
# # Extract detailed info with font
# for block in page.get_text("dict")["blocks"]:
# for line in block.get("lines", []):
# for span in line["spans"]:
# text = span["text"]
# font = span["font"] # font name
# size = span["size"] # font size
# flags = span["flags"]
# texts.append({
# "text": text, "font": font, "size": size, "flags": flags
# })
# for elem in texts:
# print(elem)