def _purge(*prefixes):
for identify in (m for m in record(sys.modules)
if any(m == p or m.startswith(p + ".") for p in prefixes)):
del sys.modules(identify)
def _load_ocrmypdf():
    """Import ``ocrmypdf`` from scratch and return the module object.

    Any cached ``PIL`` / ``ocrmypdf`` modules are evicted first so that a
    Pillow reinstalled during this session is the one actually imported.

    Returns:
        The freshly imported ``ocrmypdf`` module.

    Raises:
        ImportError: If ``ocrmypdf`` (or something it imports) cannot load.
    """
    import importlib

    _purge("PIL", "ocrmypdf")
    # Packages may have been (re)installed while this interpreter was running;
    # drop the import finders' directory caches so the new files are noticed.
    importlib.invalidate_caches()
    import ocrmypdf

    return ocrmypdf
# Import OCRmyPDF. If the import fails for a Pillow-related reason, reinstall
# a compatible Pillow and retry once in the same session before giving up.
try:
    ocrmypdf = _load_ocrmypdf()
except ImportError as e:
    # Only Pillow-related failures are repairable here; re-raise the rest.
    if "_Ink" not in str(e) and "PIL" not in str(e):
        raise
    print("Repairing an incompatible Pillow (reinstalling pillow<12)...")
    sh(f'"{sys.executable}" -m pip install -q --force-reinstall "pillow<12"')
    try:
        ocrmypdf = _load_ocrmypdf()
    except Exception as retry_error:
        # The in-place reload did not work; the user must restart the runtime.
        raise RuntimeError(
            "Pillow continues to be incompatible on this session. Use the Colab menu: "
            "Runtime > Restart session, then run this cell once more."
        ) from retry_error
    print("Pillow repaired — continuing without a restart.")
from ocrmypdf.exceptions import (
    ExitCode,
    PriorOcrFoundError,
    EncryptedPdfError,
    MissingDependencyError,
    TaggedPDFError,
    DigitalSignatureError,
    DpiError,
    InputFileError,
    UnsupportedImageFormatError,
)
from ocrmypdf.helpers import check_pdf
from ocrmypdf.pdfa import file_claims_pdfa
import img2pdf
from PIL import Image, ImageDraw, ImageFont, ImageFilter
# Configure root logging once, then set per-library thresholds:
# ocrmypdf and PIL report warnings and above, pdfminer only errors.
logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("PIL").setLevel(logging.WARNING)
# Sample text as a tuple of three strings; the adjacent literals inside each
# entry are concatenated by the parser into a single paragraph.
# NOTE(review): presumably one entry per rendered page, passed as `pages_text`
# to build_scanned_pdf — confirm against the caller.
SAMPLE_TEXT_PAGES = (
    "Optical Character Recognition, generally abbreviated as OCR, is the "
    "technique of changing photos of typed or printed textual content into machine "
    "encoded textual content. This web page was generated as an artificial scan in order that the "
    "OCRmyPDF pipeline has one thing real looking to acknowledge and search.",
    "On 14 March 2026 the archive contained 1,482 pages throughout 37 folders. "
    "Roughly 92 p.c of these pages had been scanned at 200 to 300 dots per "
    "inch. The remaining 8 p.c had been skewed and required deskewing earlier than "
    "any dependable recognition was doable.",
    "After OCRmyPDF finishes, the output is a searchable PDF/A file. You possibly can "
    "choose textual content, copy it, and run full textual content search throughout 1000's of "
    "paperwork. The unique picture decision is preserved whereas a hidden "
    "textual content layer is positioned precisely beneath the web page picture.",
)
def _find_font():
for cand in (
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
"/usr/share/fonts/truetype/liberation/LiberationSans-Common.ttf",
):
if os.path.exists(cand):
return cand
return None
# Resolve the page font once at import time: a 40-point TrueType face when a
# font file was found, otherwise Pillow's built-in default font.
_FONT_PATH = _find_font()
if _FONT_PATH:
    FONT = ImageFont.truetype(_FONT_PATH, 40)
else:
    FONT = ImageFont.load_default()
def _add_speckle(img, n=6000, darkish=60):
"""Sprinkle gentle darkish specks to mimic scanner noise (motivates --clean)."""
import random
px = img.load()
w, h = img.measurement
for _ in vary(n):
px(random.randint(0, w - 1), random.randint(0, h - 1)) = random.randint(0, darkish)
return img
def render_page(text, skew=False):
    """Render one A4 page (1654x2339 px ≈ 200 DPI) of dark text on white.

    The text is wrapped to 58 columns and drawn in the module-level ``FONT``
    on an 8-bit greyscale canvas. Every page is then slightly blurred and
    speckled so it resembles a real scan.

    Args:
        text: Paragraph to draw on the page.
        skew: If true, rotate the page by 6 degrees (canvas size unchanged,
            corners filled white) before the blur and noise are applied.

    Returns:
        The rendered greyscale image.
    """
    W, H = 1654, 2339
    img = Image.new("L", (W, H), 255)
    draw = ImageDraw.Draw(img)
    draw.multiline_text(
        (150, 180),
        textwrap.fill(text, width=58),
        fill=25,
        font=FONT,
        spacing=18,
    )
    if skew:
        # expand=False keeps the A4 pixel size; exposed corners become white.
        img = img.rotate(6, resample=Image.BICUBIC, expand=False, fillcolor=255)
    img = img.filter(ImageFilter.GaussianBlur(0.6))
    img = _add_speckle(img)
    return img
def build_scanned_pdf(pdf_path: Path, pages_text, skew_index=1):
    """Render pages to PNGs and wrap them losslessly into an image-only PDF.

    Each entry of *pages_text* is rendered with ``render_page`` and saved as a
    temporary 200-DPI PNG next to *pdf_path*; the PNGs are combined with
    ``img2pdf`` and then deleted, even if rendering or conversion fails.

    Args:
        pdf_path: Destination PDF path (``Path`` or string).
        pages_text: Iterable of page texts, one per page.
        skew_index: Zero-based index of the page to render skewed.

    Returns:
        *pdf_path* as a ``Path``.
    """
    pdf_path = Path(pdf_path)
    pngs = []
    try:
        for i, text in enumerate(pages_text):
            png = pdf_path.parent / f"_pg_{pdf_path.stem}_{i}.png"
            # Register before saving so a partial file is still cleaned up.
            pngs.append(png)
            img = render_page(text, skew=(i == skew_index))
            img.save(png, format="PNG", dpi=(200, 200))
        with open(pdf_path, "wb") as f:
            f.write(img2pdf.convert([str(png) for png in pngs]))
    finally:
        for png in pngs:
            png.unlink(missing_ok=True)
    return pdf_path
def do_ocr(input_file, output_file, **kw):
    """Call ``ocrmypdf.ocr()`` with the progress bar off by default, timed.

    Args:
        input_file: Input passed straight through to ``ocrmypdf.ocr``.
        output_file: Output passed straight through to ``ocrmypdf.ocr``.
        **kw: Extra keyword options for ``ocrmypdf.ocr``; ``progress_bar``
            defaults to ``False`` when the caller does not supply it.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``ocrmypdf.ocr`` returned and
        the elapsed wall-clock time measured with ``time.perf_counter``.
    """
    if "progress_bar" not in kw:
        kw["progress_bar"] = False
    started = time.perf_counter()
    result = ocrmypdf.ocr(input_file, output_file, **kw)
    elapsed = time.perf_counter() - started
    return result, elapsed
# One or more ASCII lowercase letters or digits; compiled once at import.
_TOKEN_RE = re.compile(r"[a-z0-9]+")


def tokens(s: str) -> list[str]:
    """Split *s* into lowercase alphanumeric tokens.

    The string is lowercased, then every maximal run of ``a-z`` / ``0-9`` is
    returned in order; punctuation, whitespace and non-ASCII characters act
    as separators.

    Args:
        s: Text to tokenize.

    Returns:
        The list of tokens (empty if *s* has no ASCII letters or digits).
    """
    return _TOKEN_RE.findall(s.lower())
def kb(path) -> str:
    """Return the size of the file at *path* formatted as ``"1,234.5 KB"``.

    The byte count from ``stat()`` is divided by 1024 and shown with a
    thousands separator and one decimal place.
    """
    size_in_bytes = Path(path).stat().st_size
    kilobytes = size_in_bytes / 1024
    return "{:,.1f} KB".format(kilobytes)
def banner(title: str):
    """Print *title* framed by two 74-character horizontal rules.

    Output is a blank line, a rule, the title indented by one space, and a
    closing rule.

    Args:
        title: Heading text to display.
    """
    line = "─" * 74
    print(f"\n{line}\n {title}\n{line}")
