156 lines
3.4 KiB
Python
156 lines
3.4 KiB
Python
from pathlib import Path
|
|
from PyPDF2 import PdfReader
|
|
import pymupdf as pm
|
|
|
|
# Directory that contains this script; the data and output folders sit beside it.
ROOT_PATH = Path(__file__).parent
# Input directory handed to process_all_files.
DATA_PATH = ROOT_PATH / "data"
# Destination directory for the extracted text files.
OUTPUT_PATH = ROOT_PATH / "output"
# Misspelled original name, kept as an alias so existing references keep working.
OUTPUT_PAHT = OUTPUT_PATH
|
|
|
|
|
|
|
|
|
|
def pdf_is_readable(input_file):
    """Report whether a PDF contains any extractable text.

    Opens ``input_file`` with ``PdfReader`` and inspects the text extracted
    from each page in order, stopping at the first page whose text is
    non-empty after stripping whitespace.

    Args:
        input_file: The PDF source handed straight to ``PdfReader``.

    Returns:
        ``True`` if at least one page yields non-whitespace text,
        otherwise ``False``.
    """
    pdf = PdfReader(input_file)
    page_texts = (pg.extract_text() for pg in pdf.pages)
    # any() short-circuits, so later pages are not extracted once text is found.
    return any(content and content.strip() for content in page_texts)
|
|
|
|
|
|
# def read_pdf_file(input_file):
|
|
|
|
# reader = PdfReader(input_file)
|
|
# pages = reader.pages
|
|
# print(len(pages), type(pages))
|
|
# page0 = pages[0]
|
|
# text = page0.extract_text()
|
|
# print(text)
|
|
|
|
|
|
# with open("output.txt", "w", encoding="utf-8") as file:
|
|
# file.write(text)
|
|
|
|
def process_one_file(input_file):
    """Extract the plain text of every page of a PDF.

    Args:
        input_file: Path of the PDF to read; it is passed to both
            ``pdf_is_readable`` and ``pm.open``.

    Returns:
        The text of all pages, each page followed by a newline, or ``None``
        when ``pdf_is_readable`` finds no extractable text in the file.
    """
    if not pdf_is_readable(input_file):
        # No page yielded any text; callers must handle the None result.
        return None

    # The context manager closes the document even if extracting a page fails;
    # previously the handle was never closed.
    with pm.open(input_file) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text("text") + "\n" for page in doc)
|
|
|
|
|
|
|
|
def process_all_files(input_dir: Path, output_dir: Path) -> None:
    """Extract text from every ``.pdf`` file in a directory into text files.

    For each regular file in ``input_dir`` whose suffix is exactly ``.pdf``,
    the text returned by ``process_one_file`` is written to
    ``output_dir / "<stem> extracted.txt"`` as UTF-8.

    Args:
        input_dir: Directory to scan (non-recursively) for PDF files.
        output_dir: Directory that receives the text files; it is created,
            including missing parents, if it does not exist.

    Returns:
        None. A directory without any PDF is a no-op apart from creating
        ``output_dir``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in input_dir.iterdir():
        # Only regular files with a ".pdf" suffix are candidates; filtering
        # here also avoids the KeyError the old files["pdf"] lookup raised
        # when the directory contained no PDF at all.
        if pdf_path.suffix != ".pdf" or not pdf_path.is_file():
            continue

        file_text = process_one_file(pdf_path)
        if file_text is None:
            # process_one_file found no extractable text. Skip the file rather
            # than creating an empty output and failing on write(None).
            continue

        output_file = output_dir / f"{pdf_path.stem} extracted.txt"
        output_file.write_text(file_text, encoding="utf-8")
|
|
|
|
|
|
# src = pm.open("ocr_needed_sample.pdf")
|
|
# res = pm.open()
|
|
|
|
# for page in src:
|
|
# pix = page.get_pixmap()
|
|
# pdfbytes = pix.pdfocr_tobytes(language="eng")
|
|
# imgpdf = pm.open("pdf", pdfbytes)
|
|
# res.insert_pdf(imgpdf)
|
|
|
|
# res.save("exported-document.pdf")
|
|
|
|
|
|
|
|
# Script entry point: extract text from every PDF in DATA_PATH into OUTPUT_PAHT.
# NOTE(review): this call also runs whenever the module is imported; consider
# an `if __name__ == "__main__":` guard once it is confirmed that nothing
# relies on import-time execution.
process_all_files(DATA_PATH, OUTPUT_PAHT)
|
|
|
|
|
|
|
|
|
|
|
|
# file = files["pdf"][7]
|
|
# if not pdf_is_readable(file):
|
|
# print("file is not readable")
|
|
# # print(pdf_is_readable("ocr_needed_sample.pdf"))
|
|
# print(file)
|
|
# # read_pdf_file(file)
|
|
# all_text = ""
|
|
# doc = pm.open(file)
|
|
# all_text = ""
|
|
# for page in doc:
|
|
# for block in page.get_text("dict")["blocks"]:
|
|
# print(block)
|
|
# print()
|
|
# print()
|
|
|
|
# # for page in doc:
|
|
# # text = page.get_text("text")
|
|
# # all_text += text + "\n"
|
|
# # with open("output.txt", "w", encoding="utf-8") as file:
|
|
# # file.writelines(all_text)
|
|
|
|
# all_spans = []
|
|
|
|
# for page in doc:
|
|
|
|
# spans = [
|
|
# {
|
|
# "text": span["text"],
|
|
# "flags": span["flags"],
|
|
# "page": page.number + 1
|
|
# }
|
|
# for block in page.get_text("dict")["blocks"] if block.get("")
|
|
# for line in block["lines"]
|
|
# for span in line["spans"]
|
|
# ]
|
|
|
|
# all_spans.extend(spans)
|
|
|
|
# for s in all_spans:
|
|
# if s["flags"] > 4:
|
|
# print(s)
|
|
|
|
|
|
|
|
|
|
# with open("output.txt", "w", encoding="utf-8") as file:
|
|
# file.writelines(all_text)
|
|
|
|
# blocks = page.get_text("blocks") # for larger text blocks
|
|
|
|
# texts = []
|
|
# # Extract detailed info with font
|
|
# for block in page.get_text("dict")["blocks"]:
|
|
# for line in block.get("lines", []):
|
|
# for span in line["spans"]:
|
|
# text = span["text"]
|
|
# font = span["font"] # font name
|
|
# size = span["size"] # font size
|
|
# flags = span["flags"]
|
|
# texts.append({
|
|
# "text": text, "font": font, "size": size, "flags": flags
|
|
# })
|
|
|
|
# for elem in texts:
|
|
# print(elem)
|