Artificial Intelligence

OCRmyPDF-Tutorial: Konvertieren Sie gescannte Dokumente in durchsuchbare PDF/A-Dateien mit Sidecar-Textextraktion und Stapelverarbeitung

Von admin Juni 28, 2026 #Dokumente, #durchsuchbare, #gescannte, #konvertieren, #MIT, #OCRmyPDFTutorial, #PDFADateien, #SidecarTextextraktion, #Sie, #Stapelverarbeitung, #und

def _purge(*prefixes):
   for identify in (m for m in record(sys.modules)
                if any(m == p or m.startswith(p + ".") for p in prefixes)):
       del sys.modules(identify)
def _load_ocrmypdf():
    """Import ``ocrmypdf`` after evicting cached PIL/ocrmypdf modules.

    Returns:
        The freshly imported ``ocrmypdf`` module object.
    """
    # Evict first so the import statement below re-executes the packages
    # instead of returning whatever is already cached in sys.modules.
    _purge("PIL", "ocrmypdf")
    import ocrmypdf as fresh_module
    return fresh_module
# Import ocrmypdf; if the import fails in a way that points at Pillow, try a
# one-shot in-place repair (reinstall pillow<12) before giving up.
try:
    ocrmypdf = _load_ocrmypdf()
except ImportError as e:
    # Only attempt the repair for import errors that mention Pillow; any
    # other ImportError is re-raised untouched in the else branch below.
    if "_Ink" in str(e) or "PIL" in str(e):
        print("Repairing an incompatible Pillow (reinstalling pillow<12)...")
        # NOTE(review): `sh` is defined outside this section — presumably it
        # runs the given command line in a shell; confirm its error handling.
        sh(f'"{sys.executable}" -m pip install -q --force-reinstall "pillow<12"')
        try:
            ocrmypdf = _load_ocrmypdf()
        except Exception as retry_err:
            # Keep the underlying failure attached so it is visible in the
            # traceback alongside the actionable message.
            raise RuntimeError(
                "Pillow is still incompatible in this session. Use the Colab menu: "
                "Runtime > Restart session, then run this cell again."
            ) from retry_err
        else:
            print("Pillow repaired — continuing without a restart.")
    else:
        raise
from ocrmypdf.exceptions import (
   ExitCode,
   PriorOcrFoundError,
   EncryptedPdfError,
   MissingDependencyError,
   TaggedPDFError,
   DigitalSignatureError,
   DpiError,
   InputFileError,
   UnsupportedImageFormatError,
)
from ocrmypdf.helpers import check_pdf
from ocrmypdf.pdfa import file_claims_pdfa
import img2pdf
from PIL import Image, ImageDraw, ImageFont, ImageFilter
# Root logger: warnings and above, in a compact "LEVEL: message" format.
logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
# Per-library thresholds: pdfminer is limited to errors, the others to
# warnings and above.
logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("PIL").setLevel(logging.WARNING)
# Sample paragraphs used as synthetic document text. Adjacent string literals
# are joined by implicit concatenation, so the tuple holds exactly three
# strings (one per trailing comma).
# NOTE(review): presumably each entry is rendered as one scanned page —
# confirm against the code that calls build_scanned_pdf.
# NOTE(review): the wording reads as machine-translated ("p.c",
# "1000's of paperwork"); verify against the upstream tutorial before using
# these strings as OCR ground truth.
SAMPLE_TEXT_PAGES = (
   "Optical Character Recognition, generally abbreviated as OCR, is the "
   "technique of changing photos of typed or printed textual content into machine "
   "encoded textual content. This web page was generated as an artificial scan in order that the "
   "OCRmyPDF pipeline has one thing real looking to acknowledge and search.",
   "On 14 March 2026 the archive contained 1,482 pages throughout 37 folders. "
   "Roughly 92 p.c of these pages had been scanned at 200 to 300 dots per "
   "inch. The remaining 8 p.c had been skewed and required deskewing earlier than "
   "any dependable recognition was doable.",
   "After OCRmyPDF finishes, the output is a searchable PDF/A file. You possibly can "
   "choose textual content, copy it, and run full textual content search throughout 1000's of "
   "paperwork. The unique picture decision is preserved whereas a hidden "
   "textual content layer is positioned precisely beneath the web page picture.",
)
def _find_font():
   for cand in (
       "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
       "/usr/share/fonts/truetype/liberation/LiberationSans-Common.ttf",
   ):
       if os.path.exists(cand):
           return cand
   return None
# Resolve the drawing font once at import time: a 40 pt TrueType face when a
# font file was found, otherwise Pillow's built-in default font.
_FONT_PATH = _find_font()
if _FONT_PATH:
    FONT = ImageFont.truetype(_FONT_PATH, 40)
else:
    FONT = ImageFont.load_default()
def _add_speckle(img, n=6000, darkish=60):
   """Sprinkle gentle darkish specks to mimic scanner noise (motivates --clean)."""
   import random
   px = img.load()
   w, h = img.measurement
   for _ in vary(n):
       px(random.randint(0, w - 1), random.randint(0, h - 1)) = random.randint(0, darkish)
   return img
def render_page(text, skew=False):
    """Render one A4 page (1654x2339 px, about 200 DPI) of dark text on white.

    Args:
        text: Paragraph to draw; it is wrapped to 58 characters per line.
        skew: When true, degrade the page like a poor scan: rotate it by
            6 degrees, apply a slight Gaussian blur, and add random specks.

    Returns:
        A Pillow image in mode ``"L"`` (8-bit grayscale).
    """
    width, height = 1654, 2339
    img = Image.new("L", (width, height), 255)
    draw = ImageDraw.Draw(img)
    draw.multiline_text((150, 180), textwrap.fill(text, width=58),
                        fill=25, font=FONT, spacing=18)
    if skew:
        # expand=False keeps the canvas at page size; corners uncovered by the
        # rotation are filled with white (255).
        img = img.rotate(6, resample=Image.BICUBIC, expand=False, fillcolor=255)
        img = img.filter(ImageFilter.GaussianBlur(0.6))
        img = _add_speckle(img)
    return img
def build_scanned_pdf(pdf_path: Path, pages_text, skew_index=1):
    """Render pages to PNGs and wrap them into an image-only PDF.

    Args:
        pdf_path: Destination PDF path. Temporary PNGs are written next to it
            as ``_pg_<stem>_<i>.png``.
        pages_text: Iterable of strings, one per page.
        skew_index: Zero-based index of the page rendered with ``skew=True``;
            all other pages are rendered clean.

    Returns:
        *pdf_path*, after the PDF has been written.
    """
    pngs = []
    try:
        for i, text in enumerate(pages_text):
            img = render_page(text, skew=(i == skew_index))
            p = pdf_path.parent / f"_pg_{pdf_path.stem}_{i}.png"
            img.save(p, format="PNG", dpi=(200, 200))
            pngs.append(str(p))
        with open(pdf_path, "wb") as f:
            f.write(img2pdf.convert(pngs))
    finally:
        # Remove the intermediate PNGs even when rendering or conversion
        # fails, so a failed run does not leave stray files behind.
        for p in pngs:
            os.remove(p)
    return pdf_path
def do_ocr(input_file, output_file, **kw):
    """Run ``ocrmypdf.ocr()`` with the progress bar off by default and time it.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``ocrmypdf.ocr`` returned and
        the elapsed wall-clock time measured with ``time.perf_counter``.
    """
    # Respect an explicit progress_bar choice from the caller.
    if "progress_bar" not in kw:
        kw["progress_bar"] = False
    started = time.perf_counter()
    result = ocrmypdf.ocr(input_file, output_file, **kw)
    elapsed = time.perf_counter() - started
    return result, elapsed
def tokens(s: str):
    """Return the lowercase runs of ASCII letters/digits found in *s*.

    Everything other than ``a-z`` and ``0-9`` (after lowercasing) acts as a
    separator, so punctuation and whitespace are dropped.
    """
    return re.findall(r"[a-z0-9]+", s.lower())
def kb(path) -> str:
    """Return the size of the file at *path* as text such as ``"1,500.0 KB"``.

    The size is bytes divided by 1024, shown with one decimal place and a
    thousands separator.
    """
    size_in_bytes = Path(path).stat().st_size
    kilobytes = size_in_bytes / 1024
    return format(kilobytes, ",.1f") + " KB"
def banner(title: str):
    """Print *title* between two 74-character horizontal rules.

    The output starts with a blank line, then a rule, the title indented by
    two spaces, and a closing rule.
    """
    line = "─" * 74
    print(f"\n{line}\n  {title}\n{line}")

Von admin

Schreibe einen Kommentar Antworten abbrechen

Könnten Ihre KI-Systeme nach dem EU-KI-Gesetz bereits ein hohes Risiko darstellen?

Artificial Intelligence

Kann KI ein Düsentriebwerk bauen? JARVIS Problem testet die Rolle von KI-Copiloten in anspruchsvoller Technik | MIT-Nachrichten

Professor an der University of California, der über Ted spricht, bittet Sexhändler um 3.000.000 US-Dollar, weil er glaubt, dass die Wahrscheinlichkeit, dass er „wichtige Entdeckungen“ in der Telepathie machen wird, bei „50 %“ liegt

Wie Infrastrukturausgaben zu Geschäftseinnahmen werden |